[llvm] [AMDGPU] Enable unaligned scratch accesses (PR #110219)

Fabian Ritter via llvm-commits llvm-commits at lists.llvm.org
Fri Sep 27 01:07:24 PDT 2024


https://github.com/ritter-x2a created https://github.com/llvm/llvm-project/pull/110219

This allows us to emit wide generic and scratch memory accesses when we do not have alignment information. In cases where accesses happen to be properly aligned or where generic accesses do not go to scratch memory, this improves performance of the generated code by a factor of up to 16x and reduces code size, especially when lowering memcpy and memmove intrinsics.

Also, make the use of the FeatureUnalignedScratchAccess feature more consistent: in some places, the code already assumed that unaligned accesses with the specialized flat scratch instructions are allowed independently of FeatureUnalignedScratchAccess. This patch now applies that interpretation everywhere.

Part of SWDEV-455845.

>From d75f9a3a583ada0378ea0d9056bf99d1a08e0436 Mon Sep 17 00:00:00 2001
From: Fabian Ritter <fabian.ritter at amd.com>
Date: Fri, 27 Sep 2024 03:25:04 -0400
Subject: [PATCH] [AMDGPU] Enable unaligned scratch accesses

This allows us to emit wide generic and scratch memory accesses when we do not
have alignment information. In cases where accesses happen to be properly
aligned or where generic accesses do not go to scratch memory, this improves
performance of the generated code by a factor of up to 16x and reduces code
size, especially when lowering memcpy and memmove intrinsics.

Also, make the use of the FeatureUnalignedScratchAccess feature more
consistent: in some places, the code already assumed that unaligned accesses
with the specialized flat scratch instructions are allowed independently of
FeatureUnalignedScratchAccess. This patch now applies that interpretation
everywhere.

Part of SWDEV-455845.
---
 llvm/lib/Target/AMDGPU/AMDGPU.td              |   18 +-
 .../AMDGPU/AMDGPUTargetTransformInfo.cpp      |    5 +-
 llvm/lib/Target/AMDGPU/GCNSubtarget.h         |    4 +
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |   17 +-
 .../AMDGPU/GlobalISel/legalize-load-flat.mir  | 4354 ++++++-----
 llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll    |   23 +-
 .../test/CodeGen/AMDGPU/flat-address-space.ll |   12 +-
 .../CodeGen/AMDGPU/memcpy-crash-issue63986.ll |   98 +-
 llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll    | 2438 +-----
 .../AMDGPU/memcpy-param-combinations.ll       | 6516 +++--------------
 .../AMDGPU/memmove-param-combinations.ll      | 5196 ++-----------
 11 files changed, 4654 insertions(+), 14027 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 3626fd8bc78c15..bb548575a3e083 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1171,9 +1171,9 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9",
    FeatureAddNoCarryInsts, FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts,
    FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16,
    FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureSupportsXNACK,
-   FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess,
-   FeatureNegativeScratchOffsetBug, FeatureGWS, FeatureDefaultComponentZero,
-   FeatureVmemWriteVgprInOrder
+   FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess,
+   FeatureUnalignedDSAccess, FeatureNegativeScratchOffsetBug, FeatureGWS,
+   FeatureDefaultComponentZero,FeatureVmemWriteVgprInOrder
   ]
 >;
 
@@ -1192,9 +1192,9 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10",
    FeatureVOP3Literal, FeatureDPP8, FeatureExtendedImageInsts,
    FeatureNoDataDepHazard, FeaturePkFmacF16Inst,
    FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureG16,
-   FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureImageInsts,
-   FeatureGDS, FeatureGWS, FeatureDefaultComponentZero,
-   FeatureMaxHardClauseLength63,
+   FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess,
+   FeatureUnalignedDSAccess, FeatureImageInsts, FeatureGDS, FeatureGWS,
+   FeatureDefaultComponentZero, FeatureMaxHardClauseLength63,
    FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts,
    FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts,
    FeatureVmemWriteVgprInOrder
@@ -1216,9 +1216,9 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11",
    FeatureVOP3Literal, FeatureDPP8, FeatureExtendedImageInsts,
    FeatureNoDataDepHazard, FeaturePkFmacF16Inst,
    FeatureA16, FeatureFastDenormalF32, FeatureG16,
-   FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureGDS,
-   FeatureGWS, FeatureDefaultComponentZero,
-   FeatureMaxHardClauseLength32,
+   FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess,
+   FeatureUnalignedDSAccess, FeatureGDS, FeatureGWS,
+   FeatureDefaultComponentZero, FeatureMaxHardClauseLength32,
    FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts,
    FeatureVmemWriteVgprInOrder
   ]
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 4cf7733a260ff0..909b0e2f1c9861 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -387,8 +387,9 @@ bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
   // them later if they may access private memory. We don't have enough context
   // here, and legalization can handle it.
   if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
-    return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
-      ChainSizeInBytes <= ST->getMaxPrivateElementSize();
+    return (Alignment >= 4 || ST->hasUnalignedScratchAccessEnabled() ||
+            ST->enableFlatScratch()) &&
+           ChainSizeInBytes <= ST->getMaxPrivateElementSize();
   }
   return true;
 }
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index e6b7342d5fffcf..45d3ded9fa7bd5 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -590,6 +590,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return UnalignedScratchAccess;
   }
 
+  bool hasUnalignedScratchAccessEnabled() const {
+    return UnalignedScratchAccess && UnalignedAccessMode;
+  }
+
   bool hasUnalignedAccessMode() const {
     return UnalignedAccessMode;
   }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 885ecab891b1f5..bf0f806ca50011 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1831,26 +1831,17 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
            Subtarget->hasUnalignedDSAccessEnabled();
   }
 
-  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
-    bool AlignedBy4 = Alignment >= Align(4);
-    if (IsFast)
-      *IsFast = AlignedBy4;
-
-    return AlignedBy4 ||
-           Subtarget->enableFlatScratch() ||
-           Subtarget->hasUnalignedScratchAccess();
-  }
-
   // FIXME: We have to be conservative here and assume that flat operations
   // will access scratch.  If we had access to the IR function, then we
   // could determine if any private memory was used in the function.
-  if (AddrSpace == AMDGPUAS::FLAT_ADDRESS &&
-      !Subtarget->hasUnalignedScratchAccess()) {
+  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
+      AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
     bool AlignedBy4 = Alignment >= Align(4);
     if (IsFast)
       *IsFast = AlignedBy4;
 
-    return AlignedBy4;
+    return AlignedBy4 || Subtarget->enableFlatScratch() ||
+           Subtarget->hasUnalignedScratchAccessEnabled();
   }
 
   // So long as they are correct, wide global memory operations perform better
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir
index e67f3620d013c7..bc33dc2a9d1926 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir
@@ -1,10 +1,10 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=bonaire -O0 -run-pass=legalizer  %s -o - | FileCheck -check-prefix=CI %s
 # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer  %s -o - | FileCheck -check-prefix=VI %s
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer  %s -o - | FileCheck -check-prefix=GFX9 %s
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -O0 -run-pass=legalizer  %s -o - | FileCheck -check-prefix=GFX9 %s
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -O0 -run-pass=legalizer  %s -o - | FileCheck -check-prefix=GFX9 %s
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -O0 -run-pass=legalizer  %s -o - | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer  %s -o - | FileCheck -check-prefix=GFX9PLUS %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -O0 -run-pass=legalizer  %s -o - | FileCheck -check-prefix=GFX9PLUS %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -O0 -run-pass=legalizer  %s -o - | FileCheck -check-prefix=GFX11PLUS %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -O0 -run-pass=legalizer  %s -o - | FileCheck -check-prefix=GFX11PLUS %s
 
 ---
 name: test_load_flat_s1_align1
@@ -30,14 +30,23 @@ body: |
     ; VI-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LOAD]], [[C]]
     ; VI-NEXT: $vgpr0 = COPY [[AND]](s32)
     ;
-    ; GFX9-LABEL: name: test_load_flat_s1_align1
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s8))
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-    ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LOAD]], [[C]]
-    ; GFX9-NEXT: $vgpr0 = COPY [[AND]](s32)
+    ; GFX9PLUS-LABEL: name: test_load_flat_s1_align1
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s8))
+    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX9PLUS-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LOAD]], [[C]]
+    ; GFX9PLUS-NEXT: $vgpr0 = COPY [[AND]](s32)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_s1_align1
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s8))
+    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX11PLUS-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LOAD]], [[C]]
+    ; GFX11PLUS-NEXT: $vgpr0 = COPY [[AND]](s32)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s1) = G_LOAD %0 :: (load (s1), align 1, addrspace 0)
     %2:_(s32) = G_ZEXT %1
@@ -68,14 +77,23 @@ body: |
     ; VI-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LOAD]], [[C]]
     ; VI-NEXT: $vgpr0 = COPY [[AND]](s32)
     ;
-    ; GFX9-LABEL: name: test_load_flat_s2_align1
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s8))
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
-    ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LOAD]], [[C]]
-    ; GFX9-NEXT: $vgpr0 = COPY [[AND]](s32)
+    ; GFX9PLUS-LABEL: name: test_load_flat_s2_align1
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s8))
+    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+    ; GFX9PLUS-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LOAD]], [[C]]
+    ; GFX9PLUS-NEXT: $vgpr0 = COPY [[AND]](s32)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_s2_align1
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s8))
+    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+    ; GFX11PLUS-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LOAD]], [[C]]
+    ; GFX11PLUS-NEXT: $vgpr0 = COPY [[AND]](s32)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s2) = G_LOAD %0 :: (load (s2), align 1, addrspace 0)
     %2:_(s32) = G_ZEXT %1
@@ -102,12 +120,19 @@ body: |
     ; VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s8), align 4)
     ; VI-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     ;
-    ; GFX9-LABEL: name: test_load_flat_s8_align4
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s8), align 4)
-    ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](s32)
+    ; GFX9PLUS-LABEL: name: test_load_flat_s8_align4
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s8), align 4)
+    ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_s8_align4
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s8), align 4)
+    ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s8) = G_LOAD %0 :: (load (s8), align 4, addrspace 0)
     %2:_(s32) = G_ANYEXT %1
@@ -134,12 +159,19 @@ body: |
     ; VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s8))
     ; VI-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     ;
-    ; GFX9-LABEL: name: test_load_flat_s8_align1
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s8))
-    ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](s32)
+    ; GFX9PLUS-LABEL: name: test_load_flat_s8_align1
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s8))
+    ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_s8_align1
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s8))
+    ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s8) = G_LOAD %0 :: (load (s8), align 1, addrspace 0)
     %2:_(s32) = G_ANYEXT %1
@@ -166,12 +198,19 @@ body: |
     ; VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 4)
     ; VI-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     ;
-    ; GFX9-LABEL: name: test_load_flat_s16_align4
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 4)
-    ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](s32)
+    ; GFX9PLUS-LABEL: name: test_load_flat_s16_align4
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 4)
+    ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_s16_align4
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 4)
+    ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s16) = G_LOAD %0 :: (load (s16), align 4, addrspace 0)
     %2:_(s32) = G_ANYEXT %1
@@ -198,12 +237,19 @@ body: |
     ; VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16))
     ; VI-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     ;
-    ; GFX9-LABEL: name: test_load_flat_s16_align2
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16))
-    ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](s32)
+    ; GFX9PLUS-LABEL: name: test_load_flat_s16_align2
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16))
+    ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_s16_align2
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16))
+    ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s16) = G_LOAD %0 :: (load (s16), align 2, addrspace 0)
     %2:_(s32) = G_ANYEXT %1
@@ -242,18 +288,25 @@ body: |
     ; VI-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
     ; VI-NEXT: $vgpr0 = COPY [[OR]](s32)
     ;
-    ; GFX9-LABEL: name: test_load_flat_s16_align1
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: $vgpr0 = COPY [[OR]](s32)
+    ; GFX9PLUS-LABEL: name: test_load_flat_s16_align1
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR]](s32)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_s16_align1
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1)
+    ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s16) = G_LOAD %0 :: (load (s16), align 1, addrspace 0)
     %2:_(s32) = G_ANYEXT %1
@@ -280,12 +333,19 @@ body: |
     ; VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32))
     ; VI-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     ;
-    ; GFX9-LABEL: name: test_load_flat_s32_align4
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32))
-    ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](s32)
+    ; GFX9PLUS-LABEL: name: test_load_flat_s32_align4
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32))
+    ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_s32_align4
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32))
+    ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s32) = G_LOAD %0 :: (load (s32), align 4, addrspace 0)
     $vgpr0 = COPY %1
@@ -323,18 +383,25 @@ body: |
     ; VI-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
     ; VI-NEXT: $vgpr0 = COPY [[OR]](s32)
     ;
-    ; GFX9-LABEL: name: test_load_flat_s32_align2
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: $vgpr0 = COPY [[OR]](s32)
+    ; GFX9PLUS-LABEL: name: test_load_flat_s32_align2
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
+    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
+    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR]](s32)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_s32_align2
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2)
+    ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s32) = G_LOAD %0 :: (load (s32), align 2, addrspace 0)
     $vgpr0 = COPY %1
@@ -392,28 +459,35 @@ body: |
     ; VI-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
     ; VI-NEXT: $vgpr0 = COPY [[OR2]](s32)
     ;
-    ; GFX9-LABEL: name: test_load_flat_s32_align1
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX9-NEXT: $vgpr0 = COPY [[OR2]](s32)
+    ; GFX9PLUS-LABEL: name: test_load_flat_s32_align1
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR2]](s32)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_s32_align1
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1)
+    ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s32) = G_LOAD %0 :: (load (s32), align 1, addrspace 0)
     $vgpr0 = COPY %1
@@ -471,14 +545,23 @@ body: |
     ; VI-NEXT: [[AND2:%[0-9]+]]:_(s64) = G_AND [[MV]], [[C4]]
     ; VI-NEXT: $vgpr0_vgpr1 = COPY [[AND2]](s64)
     ;
-    ; GFX9-LABEL: name: test_load_flat_s48_align8
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64))
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 281474976710655
-    ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[LOAD]], [[C]]
-    ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[AND]](s64)
+    ; GFX9PLUS-LABEL: name: test_load_flat_s48_align8
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64))
+    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 281474976710655
+    ; GFX9PLUS-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[LOAD]], [[C]]
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[AND]](s64)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_s48_align8
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64))
+    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 281474976710655
+    ; GFX11PLUS-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[LOAD]], [[C]]
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[AND]](s64)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s48) = G_LOAD %0 :: (load (s48), align 8, addrspace 0)
     %2:_(s64) = G_ZEXT %1
@@ -513,12 +596,19 @@ body: |
     ; VI-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32)
     ; VI-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64)
     ;
-    ; GFX9-LABEL: name: test_load_flat_s64_align8
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64))
-    ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
+    ; GFX9PLUS-LABEL: name: test_load_flat_s64_align8
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64))
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_s64_align8
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64))
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s64) = G_LOAD %0 :: (load (s64), align 8, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -552,12 +642,19 @@ body: |
     ; VI-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32)
     ; VI-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64)
     ;
-    ; GFX9-LABEL: name: test_load_flat_s64_align4
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 4)
-    ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
+    ; GFX9PLUS-LABEL: name: test_load_flat_s64_align4
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 4)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_s64_align4
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 4)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s64) = G_LOAD %0 :: (load (s64), align 4, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -611,30 +708,37 @@ body: |
     ; VI-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32)
     ; VI-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64)
     ;
-    ; GFX9-LABEL: name: test_load_flat_s64_align2
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32)
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
-    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32)
-    ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]]
-    ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[OR2]](s64)
+    ; GFX9PLUS-LABEL: name: test_load_flat_s64_align2
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
+    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
+    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32)
+    ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
+    ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
+    ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32)
+    ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32)
+    ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]]
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[OR2]](s64)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_s64_align2
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 2)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s64) = G_LOAD %0 :: (load (s64), align 2, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -724,48 +828,55 @@ body: |
     ; VI-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32)
     ; VI-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64)
     ;
-    ; GFX9-LABEL: name: test_load_flat_s64_align1
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX9-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
-    ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
-    ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
-    ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
-    ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[OR6]](s64)
+    ; GFX9PLUS-LABEL: name: test_load_flat_s64_align1
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
+    ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
+    ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
+    ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
+    ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
+    ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
+    ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
+    ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[OR6]](s64)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_s64_align1
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 1)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s64) = G_LOAD %0 :: (load (s64), align 1, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -807,13 +918,21 @@ body: |
     ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
     ; VI-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     ;
-    ; GFX9-LABEL: name: test_load_flat_s96_align16
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 16)
-    ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
-    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
+    ; GFX9PLUS-LABEL: name: test_load_flat_s96_align16
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 16)
+    ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_s96_align16
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 16)
+    ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s96) = G_LOAD %0 :: (load (s96), align 16, addrspace 0)
     $vgpr0_vgpr1_vgpr2 = COPY %1
@@ -855,13 +974,21 @@ body: |
     ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
     ; VI-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     ;
-    ; GFX9-LABEL: name: test_load_flat_s96_align8
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 8)
-    ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
-    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
+    ; GFX9PLUS-LABEL: name: test_load_flat_s96_align8
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 8)
+    ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_s96_align8
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 8)
+    ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s96) = G_LOAD %0 :: (load (s96), align 8, addrspace 0)
     $vgpr0_vgpr1_vgpr2 = COPY %1
@@ -903,13 +1030,21 @@ body: |
     ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
     ; VI-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     ;
-    ; GFX9-LABEL: name: test_load_flat_s96_align4
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 4)
-    ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
-    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
+    ; GFX9PLUS-LABEL: name: test_load_flat_s96_align4
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 4)
+    ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_s96_align4
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 4)
+    ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s96) = G_LOAD %0 :: (load (s96), align 4, addrspace 0)
     $vgpr0_vgpr1_vgpr2 = COPY %1
@@ -979,34 +1114,42 @@ body: |
     ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
     ; VI-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     ;
-    ; GFX9-LABEL: name: test_load_flat_s96_align2
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
-    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8)
-    ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10)
-    ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32)
-    ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
-    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
+    ; GFX9PLUS-LABEL: name: test_load_flat_s96_align2
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
+    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
+    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
+    ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
+    ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8)
+    ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10)
+    ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
+    ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32)
+    ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_s96_align2
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 2)
+    ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s96) = G_LOAD %0 :: (load (s96), align 2, addrspace 0)
     $vgpr0_vgpr1_vgpr2 = COPY %1
@@ -1128,60 +1271,68 @@ body: |
     ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
     ; VI-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     ;
-    ; GFX9-LABEL: name: test_load_flat_s96_align1
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
-    ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
-    ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
-    ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
-    ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
-    ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
-    ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32)
-    ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
-    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
+    ; GFX9PLUS-LABEL: name: test_load_flat_s96_align1
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
+    ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
+    ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
+    ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
+    ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; GFX9PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
+    ; GFX9PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
+    ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
+    ; GFX9PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
+    ; GFX9PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
+    ; GFX9PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
+    ; GFX9PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
+    ; GFX9PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
+    ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32)
+    ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_s96_align1
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 1)
+    ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s96) = G_LOAD %0 :: (load (s96), align 1, addrspace 0)
     $vgpr0_vgpr1_vgpr2 = COPY %1
@@ -1235,18 +1386,31 @@ body: |
     ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(s160) = G_BITCAST [[BUILD_VECTOR]](<5 x s32>)
     ; VI-NEXT: S_NOP 0, implicit [[BITCAST]](s160)
     ;
-    ; GFX9-LABEL: name: test_load_flat_s160_align4
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 4)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s32) from unknown-address + 16)
-    ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<4 x s32>)
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<5 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32), [[UV3]](s32), [[LOAD1]](s32)
-    ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s160) = G_BITCAST [[BUILD_VECTOR]](<5 x s32>)
-    ; GFX9-NEXT: S_NOP 0, implicit [[BITCAST]](s160)
+    ; GFX9PLUS-LABEL: name: test_load_flat_s160_align4
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 4)
+    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s32) from unknown-address + 16)
+    ; GFX9PLUS-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<4 x s32>)
+    ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<5 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32), [[UV3]](s32), [[LOAD1]](s32)
+    ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s160) = G_BITCAST [[BUILD_VECTOR]](<5 x s32>)
+    ; GFX9PLUS-NEXT: S_NOP 0, implicit [[BITCAST]](s160)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_s160_align4
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 4)
+    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s32) from unknown-address + 16)
+    ; GFX11PLUS-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<4 x s32>)
+    ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<5 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32), [[UV3]](s32), [[LOAD1]](s32)
+    ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s160) = G_BITCAST [[BUILD_VECTOR]](<5 x s32>)
+    ; GFX11PLUS-NEXT: S_NOP 0, implicit [[BITCAST]](s160)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s160) = G_LOAD %0 :: (load (s160), align 4, addrspace 0)
     S_NOP 0, implicit %1
@@ -1316,21 +1480,37 @@ body: |
     ; VI-NEXT: [[INSERT:%[0-9]+]]:_(s256) = G_INSERT [[DEF]], [[BITCAST]](s224), 0
     ; VI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[INSERT]](s256)
     ;
-    ; GFX9-LABEL: name: test_load_flat_s224_align4
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 4)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD]](p0) :: (load (<3 x s32>) from unknown-address + 16, align 4)
-    ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<4 x s32>)
-    ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](<3 x s32>)
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<7 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32), [[UV3]](s32), [[UV4]](s32), [[UV5]](s32), [[UV6]](s32)
-    ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s224) = G_BITCAST [[BUILD_VECTOR]](<7 x s32>)
-    ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s256) = G_IMPLICIT_DEF
-    ; GFX9-NEXT: [[INSERT:%[0-9]+]]:_(s256) = G_INSERT [[DEF]], [[BITCAST]](s224), 0
-    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[INSERT]](s256)
+    ; GFX9PLUS-LABEL: name: test_load_flat_s224_align4
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 4)
+    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD]](p0) :: (load (<3 x s32>) from unknown-address + 16, align 4)
+    ; GFX9PLUS-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<4 x s32>)
+    ; GFX9PLUS-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](<3 x s32>)
+    ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<7 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32), [[UV3]](s32), [[UV4]](s32), [[UV5]](s32), [[UV6]](s32)
+    ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s224) = G_BITCAST [[BUILD_VECTOR]](<7 x s32>)
+    ; GFX9PLUS-NEXT: [[DEF:%[0-9]+]]:_(s256) = G_IMPLICIT_DEF
+    ; GFX9PLUS-NEXT: [[INSERT:%[0-9]+]]:_(s256) = G_INSERT [[DEF]], [[BITCAST]](s224), 0
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[INSERT]](s256)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_s224_align4
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 4)
+    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD]](p0) :: (load (<3 x s32>) from unknown-address + 16, align 4)
+    ; GFX11PLUS-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD]](<4 x s32>)
+    ; GFX11PLUS-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](<3 x s32>)
+    ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<7 x s32>) = G_BUILD_VECTOR [[UV]](s32), [[UV1]](s32), [[UV2]](s32), [[UV3]](s32), [[UV4]](s32), [[UV5]](s32), [[UV6]](s32)
+    ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s224) = G_BITCAST [[BUILD_VECTOR]](<7 x s32>)
+    ; GFX11PLUS-NEXT: [[DEF:%[0-9]+]]:_(s256) = G_IMPLICIT_DEF
+    ; GFX11PLUS-NEXT: [[INSERT:%[0-9]+]]:_(s256) = G_INSERT [[DEF]], [[BITCAST]](s224), 0
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[INSERT]](s256)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s224) = G_LOAD %0 :: (load (s224), align 4, addrspace 0)
      %2:_(s256) = G_IMPLICIT_DEF
@@ -1381,13 +1561,21 @@ body: |
     ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
     ; VI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128)
     ;
-    ; GFX9-LABEL: name: test_load_flat_s128_align16
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>))
-    ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>)
-    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128)
+    ; GFX9PLUS-LABEL: name: test_load_flat_s128_align16
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>))
+    ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_s128_align16
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>))
+    ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s128) = G_LOAD %0 :: (load (s128), align 16, addrspace 0)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -1435,13 +1623,21 @@ body: |
     ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
     ; VI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128)
     ;
-    ; GFX9-LABEL: name: test_load_flat_s128_align4
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 4)
-    ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>)
-    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128)
+    ; GFX9PLUS-LABEL: name: test_load_flat_s128_align4
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 4)
+    ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_s128_align4
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 4)
+    ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s128) = G_LOAD %0 :: (load (s128), align 4, addrspace 0)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -1593,75 +1789,83 @@ body: |
     ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
     ; VI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128)
     ;
-    ; GFX9-LABEL: name: test_load_flat_s128_align1
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
-    ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
-    ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
-    ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
-    ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
-    ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
-    ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
-    ; GFX9-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12
-    ; GFX9-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
-    ; GFX9-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
-    ; GFX9-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
-    ; GFX9-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
-    ; GFX9-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
-    ; GFX9-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
-    ; GFX9-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
-    ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
-    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128)
+    ; GFX9PLUS-LABEL: name: test_load_flat_s128_align1
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
+    ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
+    ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
+    ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
+    ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; GFX9PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
+    ; GFX9PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
+    ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
+    ; GFX9PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
+    ; GFX9PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
+    ; GFX9PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
+    ; GFX9PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
+    ; GFX9PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
+    ; GFX9PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12
+    ; GFX9PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
+    ; GFX9PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
+    ; GFX9PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
+    ; GFX9PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
+    ; GFX9PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
+    ; GFX9PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
+    ; GFX9PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
+    ; GFX9PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
+    ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
+    ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_s128_align1
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 1)
+    ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s128) = G_LOAD %0 :: (load (s128), align 1, addrspace 0)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -1733,17 +1937,29 @@ body: |
     ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(s256) = G_BITCAST [[BUILD_VECTOR]](<8 x s32>)
     ; VI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BITCAST]](s256)
     ;
-    ; GFX9-LABEL: name: test_load_flat_s256_align32
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>))
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD]](p0) :: (load (<4 x s32>) from unknown-address + 16)
-    ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>)
-    ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s256) = G_BITCAST [[CONCAT_VECTORS]](<8 x s32>)
-    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BITCAST]](s256)
+    ; GFX9PLUS-LABEL: name: test_load_flat_s256_align32
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>))
+    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD]](p0) :: (load (<4 x s32>) from unknown-address + 16)
+    ; GFX9PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>)
+    ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s256) = G_BITCAST [[CONCAT_VECTORS]](<8 x s32>)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BITCAST]](s256)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_s256_align32
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>))
+    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD]](p0) :: (load (<4 x s32>) from unknown-address + 16)
+    ; GFX11PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>)
+    ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s256) = G_BITCAST [[CONCAT_VECTORS]](<8 x s32>)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BITCAST]](s256)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s256) = G_LOAD %0 :: (load (s256), align 16, addrspace 0)
     $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY %1
@@ -1777,12 +1993,19 @@ body: |
     ; VI-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32)
     ; VI-NEXT: $vgpr0_vgpr1 = COPY [[MV]](p1)
     ;
-    ; GFX9-LABEL: name: test_load_flat_p1_align8
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p0) :: (load (p1))
-    ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
+    ; GFX9PLUS-LABEL: name: test_load_flat_p1_align8
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p0) :: (load (p1))
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_p1_align8
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p0) :: (load (p1))
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(p1) = G_LOAD %0 :: (load (p1), align 8, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -1816,12 +2039,19 @@ body: |
     ; VI-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32)
     ; VI-NEXT: $vgpr0_vgpr1 = COPY [[MV]](p1)
     ;
-    ; GFX9-LABEL: name: test_load_flat_p1_align4
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p0) :: (load (p1), align 4)
-    ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
+    ; GFX9PLUS-LABEL: name: test_load_flat_p1_align4
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p0) :: (load (p1), align 4)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_p1_align4
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p0) :: (load (p1), align 4)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(p1) = G_LOAD %0 :: (load (p1), align 4, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -1911,49 +2141,56 @@ body: |
     ; VI-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32)
     ; VI-NEXT: $vgpr0_vgpr1 = COPY [[MV]](p1)
     ;
-    ; GFX9-LABEL: name: test_load_flat_p1_align1
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX9-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
-    ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
-    ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
-    ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
-    ; GFX9-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR6]](s64)
-    ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1)
+    ; GFX9PLUS-LABEL: name: test_load_flat_p1_align1
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
+    ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
+    ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
+    ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
+    ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
+    ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
+    ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
+    ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
+    ; GFX9PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR6]](s64)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_p1_align1
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p0) :: (load (p1), align 1)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(p1) = G_LOAD %0 :: (load (p1), align 1, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -1979,12 +2216,19 @@ body: |
     ; VI-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p0) :: (load (p3))
     ; VI-NEXT: $vgpr0 = COPY [[LOAD]](p3)
     ;
-    ; GFX9-LABEL: name: test_load_flat_p3_align4
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p0) :: (load (p3))
-    ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](p3)
+    ; GFX9PLUS-LABEL: name: test_load_flat_p3_align4
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p0) :: (load (p3))
+    ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](p3)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_p3_align4
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p0) :: (load (p3))
+    ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](p3)
      %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(p3) = G_LOAD %0 :: (load (p3), align 4, addrspace 0)
     $vgpr0 = COPY %1
@@ -2018,12 +2262,19 @@ body: |
     ; VI-NEXT: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32)
     ; VI-NEXT: $vgpr0_vgpr1 = COPY [[MV]](p4)
     ;
-    ; GFX9-LABEL: name: test_load_flat_p4_align8
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4))
-    ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4)
+    ; GFX9PLUS-LABEL: name: test_load_flat_p4_align8
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4))
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_p4_align8
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4))
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(p4) = G_LOAD %0 :: (load (p4), align 8, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -2057,12 +2308,19 @@ body: |
     ; VI-NEXT: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32)
     ; VI-NEXT: $vgpr0_vgpr1 = COPY [[MV]](p4)
     ;
-    ; GFX9-LABEL: name: test_load_flat_p4_align4
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 4)
-    ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4)
+    ; GFX9PLUS-LABEL: name: test_load_flat_p4_align4
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 4)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_p4_align4
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 4)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(p4) = G_LOAD %0 :: (load (p4), align 4, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -2116,31 +2374,38 @@ body: |
     ; VI-NEXT: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32)
     ; VI-NEXT: $vgpr0_vgpr1 = COPY [[MV]](p4)
     ;
-    ; GFX9-LABEL: name: test_load_flat_p4_align2
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32)
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
-    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32)
-    ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]]
-    ; GFX9-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR2]](s64)
-    ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4)
+    ; GFX9PLUS-LABEL: name: test_load_flat_p4_align2
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
+    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
+    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32)
+    ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
+    ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
+    ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32)
+    ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32)
+    ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]]
+    ; GFX9PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR2]](s64)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_p4_align2
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 2)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(p4) = G_LOAD %0 :: (load (p4), align 2, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -2230,49 +2495,56 @@ body: |
     ; VI-NEXT: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32)
     ; VI-NEXT: $vgpr0_vgpr1 = COPY [[MV]](p4)
     ;
-    ; GFX9-LABEL: name: test_load_flat_p4_align1
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX9-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
-    ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
-    ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
-    ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
-    ; GFX9-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR6]](s64)
-    ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4)
+    ; GFX9PLUS-LABEL: name: test_load_flat_p4_align1
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
+    ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
+    ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
+    ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
+    ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
+    ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
+    ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
+    ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
+    ; GFX9PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR6]](s64)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_p4_align1
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 1)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(p4) = G_LOAD %0 :: (load (p4), align 1, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -2298,12 +2570,19 @@ body: |
     ; VI-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5))
     ; VI-NEXT: $vgpr0 = COPY [[LOAD]](p5)
     ;
-    ; GFX9-LABEL: name: test_load_flat_p5_align4
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5))
-    ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](p5)
+    ; GFX9PLUS-LABEL: name: test_load_flat_p5_align4
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5))
+    ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](p5)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_p5_align4
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5))
+    ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](p5)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(p5) = G_LOAD %0 :: (load (p5), align 4, addrspace 0)
     $vgpr0 = COPY %1
@@ -2343,19 +2622,26 @@ body: |
     ; VI-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32)
     ; VI-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5)
     ;
-    ; GFX9-LABEL: name: test_load_flat_p5_align2
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32)
-    ; GFX9-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5)
+    ; GFX9PLUS-LABEL: name: test_load_flat_p5_align2
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
+    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
+    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX9PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32)
+    ; GFX9PLUS-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_p5_align2
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5), align 2)
+    ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](p5)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(p5) = G_LOAD %0 :: (load (p5), align 2, addrspace 0)
     $vgpr0 = COPY %1
@@ -2415,29 +2701,36 @@ body: |
     ; VI-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32)
     ; VI-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5)
     ;
-    ; GFX9-LABEL: name: test_load_flat_p5_align1
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX9-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32)
-    ; GFX9-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5)
+    ; GFX9PLUS-LABEL: name: test_load_flat_p5_align1
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; GFX9PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32)
+    ; GFX9PLUS-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_p5_align1
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5), align 1)
+    ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](p5)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(p5) = G_LOAD %0 :: (load (p5), align 1, addrspace 0)
     $vgpr0 = COPY %1
@@ -2463,12 +2756,19 @@ body: |
     ; VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 4)
     ; VI-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v2s8_align4
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 4)
-    ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](s32)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v2s8_align4
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 4)
+    ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v2s8_align4
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 4)
+    ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<2 x s8>) = G_LOAD %0 :: (load (<2 x s8>), align 4, addrspace 0)
     %2:_(s16) = G_BITCAST %1
@@ -2496,12 +2796,19 @@ body: |
     ; VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16))
     ; VI-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v2s8_align2
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16))
-    ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](s32)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v2s8_align2
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16))
+    ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v2s8_align2
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16))
+    ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<2 x s8>) = G_LOAD %0 :: (load (<2 x s8>), align 2, addrspace 0)
     %2:_(s16) = G_BITCAST %1
@@ -2541,18 +2848,25 @@ body: |
     ; VI-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
     ; VI-NEXT: $vgpr0 = COPY [[OR]](s32)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v2s8_align1
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: $vgpr0 = COPY [[OR]](s32)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v2s8_align1
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR]](s32)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v2s8_align1
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1)
+    ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<2 x s8>) = G_LOAD %0 :: (load (<2 x s8>), align 1, addrspace 0)
     %2:_(s16) = G_BITCAST %1
@@ -2628,35 +2942,65 @@ body: |
     ; VI-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]]
     ; VI-NEXT: $vgpr0 = COPY [[OR2]](s32)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v3s8_align4
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32))
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C]](s32)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-    ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
-    ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C2]]
-    ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
-    ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C2]]
-    ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C3]](s16)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL]]
-    ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
-    ; GFX9-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C2]]
-    ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32)
-    ; GFX9-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C2]]
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C3]](s16)
-    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]]
-    ; GFX9-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16)
-    ; GFX9-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16)
-    ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]]
-    ; GFX9-NEXT: $vgpr0 = COPY [[OR2]](s32)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v3s8_align4
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32))
+    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX9PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C]](s32)
+    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX9PLUS-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+    ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
+    ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
+    ; GFX9PLUS-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C2]]
+    ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+    ; GFX9PLUS-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C2]]
+    ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
+    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C3]](s16)
+    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL]]
+    ; GFX9PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
+    ; GFX9PLUS-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C2]]
+    ; GFX9PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32)
+    ; GFX9PLUS-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C2]]
+    ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C3]](s16)
+    ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]]
+    ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16)
+    ; GFX9PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16)
+    ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]]
+    ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR2]](s32)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v3s8_align4
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32))
+    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX11PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C]](s32)
+    ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX11PLUS-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C1]](s32)
+    ; GFX11PLUS-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+    ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
+    ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
+    ; GFX11PLUS-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C2]]
+    ; GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+    ; GFX11PLUS-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C2]]
+    ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
+    ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C3]](s16)
+    ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL]]
+    ; GFX11PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
+    ; GFX11PLUS-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C2]]
+    ; GFX11PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32)
+    ; GFX11PLUS-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C2]]
+    ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C3]](s16)
+    ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]]
+    ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16)
+    ; GFX11PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16)
+    ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32)
+    ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]]
+    ; GFX11PLUS-NEXT: $vgpr0 = COPY [[OR2]](s32)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<3 x s8>) = G_LOAD %0 :: (load (<3 x s8>), align 4, addrspace 0)
     %2:_(s24) = G_BITCAST %1
@@ -2752,45 +3096,80 @@ body: |
     ; VI-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]]
     ; VI-NEXT: $vgpr0 = COPY [[OR4]](s32)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v3s8_align1
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]]
-    ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32)
-    ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32)
-    ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-    ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
-    ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
-    ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]]
-    ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
-    ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]]
-    ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
-    ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16)
-    ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]]
-    ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
-    ; GFX9-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]]
-    ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32)
-    ; GFX9-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]]
-    ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16)
-    ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL3]]
-    ; GFX9-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16)
-    ; GFX9-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16)
-    ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]]
-    ; GFX9-NEXT: $vgpr0 = COPY [[OR4]](s32)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v3s8_align1
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32)
+    ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]]
+    ; GFX9PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32)
+    ; GFX9PLUS-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+    ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
+    ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
+    ; GFX9PLUS-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]]
+    ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+    ; GFX9PLUS-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]]
+    ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
+    ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16)
+    ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]]
+    ; GFX9PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
+    ; GFX9PLUS-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]]
+    ; GFX9PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32)
+    ; GFX9PLUS-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]]
+    ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16)
+    ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL3]]
+    ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16)
+    ; GFX9PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16)
+    ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32)
+    ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]]
+    ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR4]](s32)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v3s8_align1
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16), align 1)
+    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 2)
+    ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX11PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C2]](s32)
+    ; GFX11PLUS-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32)
+    ; GFX11PLUS-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+    ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
+    ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
+    ; GFX11PLUS-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]]
+    ; GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+    ; GFX11PLUS-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]]
+    ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
+    ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16)
+    ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL1]]
+    ; GFX11PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
+    ; GFX11PLUS-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]]
+    ; GFX11PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32)
+    ; GFX11PLUS-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]]
+    ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16)
+    ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]]
+    ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16)
+    ; GFX11PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16)
+    ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32)
+    ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]]
+    ; GFX11PLUS-NEXT: $vgpr0 = COPY [[OR3]](s32)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<3 x s8>) = G_LOAD %0 :: (load (<3 x s8>), align 1, addrspace 0)
     %2:_(s24) = G_BITCAST %1
@@ -2818,12 +3197,19 @@ body: |
     ; VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32))
     ; VI-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v4s8_align4
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32))
-    ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](s32)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v4s8_align4
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32))
+    ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v4s8_align4
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32))
+    ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<4 x s8>) = G_LOAD %0 :: (load (<4 x s8>), align 4, addrspace 0)
     %2:_(s32) = G_BITCAST %1
@@ -2862,18 +3248,25 @@ body: |
     ; VI-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
     ; VI-NEXT: $vgpr0 = COPY [[OR]](s32)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v4s8_align2
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: $vgpr0 = COPY [[OR]](s32)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v4s8_align2
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
+    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
+    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR]](s32)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v4s8_align2
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2)
+    ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<4 x s8>) = G_LOAD %0 :: (load (<4 x s8>), align 2, addrspace 0)
     %2:_(s32) = G_BITCAST %1
@@ -2932,28 +3325,35 @@ body: |
     ; VI-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
     ; VI-NEXT: $vgpr0 = COPY [[OR2]](s32)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v4s8_align1
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX9-NEXT: $vgpr0 = COPY [[OR2]](s32)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v4s8_align1
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR2]](s32)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v4s8_align1
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1)
+    ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<4 x s8>) = G_LOAD %0 :: (load (<4 x s8>), align 1, addrspace 0)
     %2:_(s32) = G_BITCAST %1
@@ -2988,12 +3388,19 @@ body: |
     ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32)
     ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v8s8_align8
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>))
-    ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v8s8_align8
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>))
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v8s8_align8
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>))
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<8 x s8>) = G_LOAD %0 :: (load (<8 x s8>), align 8, addrspace 0)
     %2:_(<2 x s32>) = G_BITCAST %1
@@ -3040,12 +3447,19 @@ body: |
     ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32)
     ; VI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v16s8_align16
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>))
-    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v16s8_align16
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>))
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v16s8_align16
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>))
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<16 x s8>) = G_LOAD %0 :: (load (<16 x s8>), align 16, addrspace 0)
     %2:_(<4 x s32>) = G_BITCAST %1
@@ -3116,16 +3530,27 @@ body: |
     ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32), [[LOAD4]](s32), [[LOAD5]](s32), [[LOAD6]](s32), [[LOAD7]](s32)
     ; VI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<8 x s32>)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v32s8_align32
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 32)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD]](p0) :: (load (<4 x s32>) from unknown-address + 16)
-    ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>)
-    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<8 x s32>)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v32s8_align32
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 32)
+    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD]](p0) :: (load (<4 x s32>) from unknown-address + 16)
+    ; GFX9PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<8 x s32>)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v32s8_align32
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 32)
+    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD]](p0) :: (load (<4 x s32>) from unknown-address + 16)
+    ; GFX11PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<8 x s32>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<32 x s8>) = G_LOAD %0 :: (load (<32 x s8>), align 32, addrspace 0)
     %2:_(<8 x s32>) = G_BITCAST %1
@@ -3153,12 +3578,19 @@ body: |
     ; VI-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>))
     ; VI-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v2s16_align4
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>))
-    ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v2s16_align4
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>))
+    ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v2s16_align4
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>))
+    ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<2 x s16>) = G_LOAD %0 :: (load (<2 x s16>), align 4, addrspace 0)
     $vgpr0 = COPY %1
@@ -3204,18 +3636,25 @@ body: |
     ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
     ; VI-NEXT: $vgpr0 = COPY [[BITCAST]](<2 x s16>)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v2s16_align2
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16))
-    ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-    ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v2s16_align2
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16))
+    ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
+    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
+    ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
+    ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+    ; GFX9PLUS-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v2s16_align2
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>), align 2)
+    ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<2 x s16>) = G_LOAD %0 :: (load (<2 x s16>), align 2, addrspace 0)
     $vgpr0 = COPY %1
@@ -3281,28 +3720,35 @@ body: |
     ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR2]](s32)
     ; VI-NEXT: $vgpr0 = COPY [[BITCAST]](<2 x s16>)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v2s16_align1
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-    ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v2s16_align1
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
+    ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
+    ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+    ; GFX9PLUS-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v2s16_align1
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>), align 1)
+    ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<2 x s16>) = G_LOAD %0 :: (load (<2 x s16>), align 1, addrspace 0)
     $vgpr0 = COPY %1
@@ -3380,27 +3826,49 @@ body: |
     ; VI-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>)
     ; VI-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v3s16_align8
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>))
-    ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[LOAD]](<4 x s16>)
-    ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
-    ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
-    ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>)
-    ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
-    ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
-    ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
-    ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
-    ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-    ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
-    ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[UV]](<2 x s16>), [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
-    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v3s16_align8
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>))
+    ; GFX9PLUS-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[LOAD]](<4 x s16>)
+    ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+    ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX9PLUS-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+    ; GFX9PLUS-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
+    ; GFX9PLUS-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>)
+    ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+    ; GFX9PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+    ; GFX9PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+    ; GFX9PLUS-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
+    ; GFX9PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+    ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+    ; GFX9PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+    ; GFX9PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[UV]](<2 x s16>), [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v3s16_align8
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>))
+    ; GFX11PLUS-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[LOAD]](<4 x s16>)
+    ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+    ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX11PLUS-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+    ; GFX11PLUS-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
+    ; GFX11PLUS-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>)
+    ; GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+    ; GFX11PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
+    ; GFX11PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+    ; GFX11PLUS-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
+    ; GFX11PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+    ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+    ; GFX11PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+    ; GFX11PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[UV]](<2 x s16>), [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<3 x s16>) = G_LOAD %0 :: (load (<3 x s16>), align 8, addrspace 0)
     %2:_(<3 x s16>) = G_IMPLICIT_DEF
@@ -3480,34 +3948,63 @@ body: |
     ; VI-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>)
     ; VI-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v3s16_align4
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 4)
-    ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64)
-    ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4, align 4)
-    ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
-    ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
-    ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
-    ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32)
-    ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
-    ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
-    ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-    ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
-    ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
-    ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
-    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v3s16_align4
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 4)
+    ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
+    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
+    ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
+    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4, align 4)
+    ; GFX9PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
+    ; GFX9PLUS-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+    ; GFX9PLUS-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
+    ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+    ; GFX9PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+    ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX9PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32)
+    ; GFX9PLUS-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+    ; GFX9PLUS-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+    ; GFX9PLUS-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+    ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+    ; GFX9PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+    ; GFX9PLUS-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
+    ; GFX9PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v3s16_align4
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 4)
+    ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
+    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
+    ; GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
+    ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64)
+    ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4, align 4)
+    ; GFX11PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
+    ; GFX11PLUS-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+    ; GFX11PLUS-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
+    ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+    ; GFX11PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+    ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX11PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32)
+    ; GFX11PLUS-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+    ; GFX11PLUS-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+    ; GFX11PLUS-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+    ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+    ; GFX11PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+    ; GFX11PLUS-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
+    ; GFX11PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<3 x s16>) = G_LOAD %0 :: (load (<3 x s16>), align 4, addrspace 0)
     %2:_(<3 x s16>) = G_IMPLICIT_DEF
@@ -3591,34 +4088,63 @@ body: |
     ; VI-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>)
     ; VI-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v3s16_align2
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16))
-    ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64)
-    ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
-    ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
-    ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
-    ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
-    ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32)
-    ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
-    ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
-    ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-    ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
-    ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
-    ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
-    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v3s16_align2
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16))
+    ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
+    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
+    ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
+    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
+    ; GFX9PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
+    ; GFX9PLUS-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+    ; GFX9PLUS-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
+    ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+    ; GFX9PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+    ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX9PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32)
+    ; GFX9PLUS-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+    ; GFX9PLUS-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+    ; GFX9PLUS-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+    ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+    ; GFX9PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+    ; GFX9PLUS-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
+    ; GFX9PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v3s16_align2
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16))
+    ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
+    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
+    ; GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
+    ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64)
+    ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
+    ; GFX11PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
+    ; GFX11PLUS-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+    ; GFX11PLUS-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
+    ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+    ; GFX11PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+    ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX11PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32)
+    ; GFX11PLUS-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+    ; GFX11PLUS-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+    ; GFX11PLUS-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+    ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+    ; GFX11PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+    ; GFX11PLUS-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
+    ; GFX11PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<3 x s16>) = G_LOAD %0 :: (load (<3 x s16>), align 2, addrspace 0)
     %2:_(<3 x s16>) = G_IMPLICIT_DEF
@@ -3730,48 +4256,77 @@ body: |
     ; VI-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BITCAST2]](<2 x s16>), [[BITCAST3]](<2 x s16>), [[BITCAST4]](<2 x s16>)
     ; VI-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v3s16_align1
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
-    ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
-    ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32)
-    ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
-    ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
-    ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
-    ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32)
-    ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
-    ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
-    ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-    ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
-    ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
-    ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
-    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v3s16_align1
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
+    ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
+    ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
+    ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
+    ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
+    ; GFX9PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32)
+    ; GFX9PLUS-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+    ; GFX9PLUS-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
+    ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+    ; GFX9PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+    ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX9PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32)
+    ; GFX9PLUS-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+    ; GFX9PLUS-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+    ; GFX9PLUS-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+    ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+    ; GFX9PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+    ; GFX9PLUS-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
+    ; GFX9PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v3s16_align1
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1)
+    ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
+    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2, align 1)
+    ; GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
+    ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64)
+    ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4, align 1)
+    ; GFX11PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
+    ; GFX11PLUS-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
+    ; GFX11PLUS-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
+    ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+    ; GFX11PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+    ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX11PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32)
+    ; GFX11PLUS-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
+    ; GFX11PLUS-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+    ; GFX11PLUS-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+    ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+    ; GFX11PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+    ; GFX11PLUS-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
+    ; GFX11PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<3 x s16>) = G_LOAD %0 :: (load (<3 x s16>), align 1, addrspace 0)
     %2:_(<3 x s16>) = G_IMPLICIT_DEF
@@ -3807,12 +4362,19 @@ body: |
     ; VI-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[LOAD]](<2 x s16>), [[LOAD1]](<2 x s16>)
     ; VI-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v4s16_align8
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>))
-    ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v4s16_align8
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>))
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v4s16_align8
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>))
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<4 x s16>) = G_LOAD %0 :: (load (<4 x s16>), align 8, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -3846,12 +4408,19 @@ body: |
     ; VI-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[LOAD]](<2 x s16>), [[LOAD1]](<2 x s16>)
     ; VI-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v4s16_align4
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 4)
-    ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v4s16_align4
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 4)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v4s16_align4
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 4)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<4 x s16>) = G_LOAD %0 :: (load (<4 x s16>), align 4, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -3919,28 +4488,35 @@ body: |
     ; VI-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>)
     ; VI-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v4s16_align2
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16))
-    ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64)
-    ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
-    ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6
-    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
-    ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32)
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-    ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
-    ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
-    ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v4s16_align2
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16))
+    ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
+    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
+    ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
+    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
+    ; GFX9PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
+    ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6
+    ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
+    ; GFX9PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32)
+    ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+    ; GFX9PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+    ; GFX9PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v4s16_align2
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 2)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<4 x s16>) = G_LOAD %0 :: (load (<4 x s16>), align 2, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -4044,46 +4620,53 @@ body: |
     ; VI-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST]](<2 x s16>), [[BITCAST1]](<2 x s16>)
     ; VI-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v4s16_align1
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
-    ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
-    ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32)
-    ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 6
-    ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32)
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-    ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
-    ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
-    ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v4s16_align1
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
+    ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
+    ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
+    ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
+    ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
+    ; GFX9PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32)
+    ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 6
+    ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
+    ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
+    ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; GFX9PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32)
+    ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+    ; GFX9PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+    ; GFX9PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v4s16_align1
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 1)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<4 x s16>) = G_LOAD %0 :: (load (<4 x s16>), align 1, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -4131,13 +4714,21 @@ body: |
     ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(<8 x s16>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
     ; VI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<8 x s16>)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v8s16_align8
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 8)
-    ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<8 x s16>) = G_BITCAST [[LOAD]](<4 x s32>)
-    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<8 x s16>)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v8s16_align8
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 8)
+    ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<8 x s16>) = G_BITCAST [[LOAD]](<4 x s32>)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<8 x s16>)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v8s16_align8
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 8)
+    ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<8 x s16>) = G_BITCAST [[LOAD]](<4 x s32>)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<8 x s16>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<8 x s16>) = G_LOAD %0 :: (load (<8 x s16>), align 8, addrspace 0)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -4171,12 +4762,19 @@ body: |
     ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32)
     ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v2s32_align8
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>))
-    ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v2s32_align8
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>))
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v2s32_align8
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>))
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<2 x s32>) = G_LOAD %0 :: (load (<2 x s32>), align 8, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -4210,12 +4808,19 @@ body: |
     ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32)
     ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v2s32_align4
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>), align 4)
-    ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v2s32_align4
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>), align 4)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v2s32_align4
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>), align 4)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<2 x s32>) = G_LOAD %0 :: (load (<2 x s32>), align 4, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -4250,12 +4855,19 @@ body: |
     ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32)
     ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v2s32_align1
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>), align 4)
-    ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v2s32_align1
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>), align 4)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v2s32_align1
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>), align 4)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<2 x s32>) = G_LOAD %0 :: (load (<2 x s32>), align 4, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -4295,12 +4907,19 @@ body: |
     ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32)
     ; VI-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v3s32_align16
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 16)
-    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](<3 x s32>)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v3s32_align16
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 16)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](<3 x s32>)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v3s32_align16
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 16)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](<3 x s32>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<3 x s32>) = G_LOAD %0 :: (load (<3 x s32>), align 16, addrspace 0)
     $vgpr0_vgpr1_vgpr2 = COPY %1
@@ -4342,12 +4961,19 @@ body: |
     ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32)
     ; VI-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v3s32_align4
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 4)
-    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](<3 x s32>)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v3s32_align4
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 4)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](<3 x s32>)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v3s32_align4
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 4)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](<3 x s32>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<3 x s32>) = G_LOAD %0 :: (load (<3 x s32>), align 4, addrspace 0)
     $vgpr0_vgpr1_vgpr2 = COPY %1
@@ -4393,12 +5019,19 @@ body: |
     ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32)
     ; VI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v4s32_align16
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>))
-    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v4s32_align16
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>))
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v4s32_align16
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>))
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<4 x s32>) = G_LOAD %0 :: (load (<4 x s32>), align 16, addrspace 0)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -4444,12 +5077,19 @@ body: |
     ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32)
     ; VI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v4s32_align8
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 8)
-    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v4s32_align8
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 8)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v4s32_align8
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 8)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<4 x s32>) = G_LOAD %0 :: (load (<4 x s32>), align 8, addrspace 0)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -4495,12 +5135,19 @@ body: |
     ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32)
     ; VI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v4s32_align4
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 4)
-    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v4s32_align4
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 4)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v4s32_align4
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 4)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<4 x s32>) = G_LOAD %0 :: (load (<4 x s32>), align 4, addrspace 0)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -4570,16 +5217,27 @@ body: |
     ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32), [[LOAD4]](s32), [[LOAD5]](s32), [[LOAD6]](s32), [[LOAD7]](s32)
     ; VI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<8 x s32>)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v8s32_align32
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 32)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD]](p0) :: (load (<4 x s32>) from unknown-address + 16)
-    ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>)
-    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<8 x s32>)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v8s32_align32
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 32)
+    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD]](p0) :: (load (<4 x s32>) from unknown-address + 16)
+    ; GFX9PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<8 x s32>)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v8s32_align32
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 32)
+    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD]](p0) :: (load (<4 x s32>) from unknown-address + 16)
+    ; GFX11PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<8 x s32>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<8 x s32>) = G_LOAD %0 :: (load (<8 x s32>), align 32, addrspace 0)
     $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY %1
@@ -4697,22 +5355,39 @@ body: |
     ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<16 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32), [[LOAD4]](s32), [[LOAD5]](s32), [[LOAD6]](s32), [[LOAD7]](s32), [[LOAD8]](s32), [[LOAD9]](s32), [[LOAD10]](s32), [[LOAD11]](s32), [[LOAD12]](s32), [[LOAD13]](s32), [[LOAD14]](s32), [[LOAD15]](s32)
     ; VI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[BUILD_VECTOR]](<16 x s32>)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v16s32_align32
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 32)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD]](p0) :: (load (<4 x s32>) from unknown-address + 16)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64)
-    ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD1]](p0) :: (load (<4 x s32>) from unknown-address + 32, align 32)
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 48
-    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD2]](p0) :: (load (<4 x s32>) from unknown-address + 48)
-    ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>), [[LOAD2]](<4 x s32>), [[LOAD3]](<4 x s32>)
-    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[CONCAT_VECTORS]](<16 x s32>)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v16s32_align32
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 32)
+    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD]](p0) :: (load (<4 x s32>) from unknown-address + 16)
+    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
+    ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD1]](p0) :: (load (<4 x s32>) from unknown-address + 32, align 32)
+    ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 48
+    ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD2]](p0) :: (load (<4 x s32>) from unknown-address + 48)
+    ; GFX9PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>), [[LOAD2]](<4 x s32>), [[LOAD3]](<4 x s32>)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[CONCAT_VECTORS]](<16 x s32>)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v16s32_align32
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 32)
+    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD]](p0) :: (load (<4 x s32>) from unknown-address + 16)
+    ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
+    ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64)
+    ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD1]](p0) :: (load (<4 x s32>) from unknown-address + 32, align 32)
+    ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 48
+    ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD2]](p0) :: (load (<4 x s32>) from unknown-address + 48)
+    ; GFX11PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>), [[LOAD2]](<4 x s32>), [[LOAD3]](<4 x s32>)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[CONCAT_VECTORS]](<16 x s32>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<16 x s32>) = G_LOAD %0 :: (load (<16 x s32>), align 32, addrspace 0)
     $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %1
@@ -4760,12 +5435,19 @@ body: |
     ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64)
     ; VI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v2s64_align16
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>))
-    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v2s64_align16
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>))
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v2s64_align16
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>))
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<2 x s64>) = G_LOAD %0 :: (load (<2 x s64>), align 16, addrspace 0)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -4813,12 +5495,19 @@ body: |
     ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64)
     ; VI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v2s64_align8
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 8)
-    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v2s64_align8
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 8)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v2s64_align8
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 8)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<2 x s64>) = G_LOAD %0 :: (load (<2 x s64>), align 8, addrspace 0)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -4866,12 +5555,19 @@ body: |
     ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64)
     ; VI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v2s64_align4
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 4)
-    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v2s64_align4
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 4)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v2s64_align4
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 4)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<2 x s64>) = G_LOAD %0 :: (load (<2 x s64>), align 4, addrspace 0)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -4955,49 +5651,56 @@ body: |
     ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64)
     ; VI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v2s64_align2
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32)
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
-    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32)
-    ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]]
-    ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8)
-    ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10)
-    ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD2]]
-    ; GFX9-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR3]](s32)
-    ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s16) from unknown-address + 12)
-    ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s16) from unknown-address + 14)
-    ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD3]]
-    ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR4]](s32)
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C3]](s32)
-    ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32)
-    ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s64) = G_OR [[SHL5]], [[ZEXT1]]
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR2]](s64), [[OR5]](s64)
-    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v2s64_align2
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
+    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
+    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32)
+    ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
+    ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
+    ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32)
+    ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32)
+    ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]]
+    ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8)
+    ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10)
+    ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD2]]
+    ; GFX9PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR3]](s32)
+    ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s16) from unknown-address + 12)
+    ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s16) from unknown-address + 14)
+    ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD3]]
+    ; GFX9PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR4]](s32)
+    ; GFX9PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C3]](s32)
+    ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32)
+    ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s64) = G_OR [[SHL5]], [[ZEXT1]]
+    ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR2]](s64), [[OR5]](s64)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v2s64_align2
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 2)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<2 x s64>) = G_LOAD %0 :: (load (<2 x s64>), align 2, addrspace 0)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -5149,83 +5852,90 @@ body: |
     ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64)
     ; VI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v2s64_align1
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX9-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
-    ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
-    ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
-    ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
-    ; GFX9-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
-    ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
-    ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]]
-    ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
-    ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
-    ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]]
-    ; GFX9-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]]
-    ; GFX9-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32)
-    ; GFX9-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
-    ; GFX9-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
-    ; GFX9-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]]
-    ; GFX9-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
-    ; GFX9-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
-    ; GFX9-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]]
-    ; GFX9-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]]
-    ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32)
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
-    ; GFX9-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32)
-    ; GFX9-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]]
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64)
-    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v2s64_align1
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
+    ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
+    ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
+    ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
+    ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
+    ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
+    ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
+    ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
+    ; GFX9PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; GFX9PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
+    ; GFX9PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
+    ; GFX9PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]]
+    ; GFX9PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
+    ; GFX9PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
+    ; GFX9PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]]
+    ; GFX9PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32)
+    ; GFX9PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]]
+    ; GFX9PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32)
+    ; GFX9PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
+    ; GFX9PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
+    ; GFX9PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]]
+    ; GFX9PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
+    ; GFX9PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
+    ; GFX9PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]]
+    ; GFX9PLUS-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32)
+    ; GFX9PLUS-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]]
+    ; GFX9PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32)
+    ; GFX9PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
+    ; GFX9PLUS-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32)
+    ; GFX9PLUS-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]]
+    ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v2s64_align1
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<2 x s64>) = G_LOAD %0 :: (load (<2 x s64>), align 1, addrspace 0)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -5289,19 +5999,33 @@ body: |
     ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[UV3]](s64)
     ; VI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v3s64_align32
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 32)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p0) :: (load (s64) from unknown-address + 16, align 16)
-    ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[LOAD]](<2 x s64>)
-    ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF
-    ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64), [[UV4:%[0-9]+]]:_(s64), [[UV5:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>)
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[UV]](s64), [[UV1]](s64), [[LOAD1]](s64), [[UV5]](s64)
-    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v3s64_align32
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 32)
+    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p0) :: (load (s64) from unknown-address + 16, align 16)
+    ; GFX9PLUS-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[LOAD]](<2 x s64>)
+    ; GFX9PLUS-NEXT: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF
+    ; GFX9PLUS-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64), [[UV4:%[0-9]+]]:_(s64), [[UV5:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>)
+    ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[UV]](s64), [[UV1]](s64), [[LOAD1]](s64), [[UV5]](s64)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v3s64_align32
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 32)
+    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p0) :: (load (s64) from unknown-address + 16, align 16)
+    ; GFX11PLUS-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[LOAD]](<2 x s64>)
+    ; GFX11PLUS-NEXT: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF
+    ; GFX11PLUS-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64), [[UV4:%[0-9]+]]:_(s64), [[UV5:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>)
+    ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[UV]](s64), [[UV1]](s64), [[LOAD1]](s64), [[UV5]](s64)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<3 x s64>) = G_LOAD %0 :: (load (<3 x s64>), align 32, addrspace 0)
     %2:_(<4 x s64>) = G_IMPLICIT_DEF
@@ -5367,19 +6091,33 @@ body: |
     ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[UV3]](s64)
     ; VI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v3s64_align8
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 8)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p0) :: (load (s64) from unknown-address + 16)
-    ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[LOAD]](<2 x s64>)
-    ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF
-    ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64), [[UV4:%[0-9]+]]:_(s64), [[UV5:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>)
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[UV]](s64), [[UV1]](s64), [[LOAD1]](s64), [[UV5]](s64)
-    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v3s64_align8
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 8)
+    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p0) :: (load (s64) from unknown-address + 16)
+    ; GFX9PLUS-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[LOAD]](<2 x s64>)
+    ; GFX9PLUS-NEXT: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF
+    ; GFX9PLUS-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64), [[UV4:%[0-9]+]]:_(s64), [[UV5:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>)
+    ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[UV]](s64), [[UV1]](s64), [[LOAD1]](s64), [[UV5]](s64)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v3s64_align8
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 8)
+    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p0) :: (load (s64) from unknown-address + 16)
+    ; GFX11PLUS-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[LOAD]](<2 x s64>)
+    ; GFX11PLUS-NEXT: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF
+    ; GFX11PLUS-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64), [[UV4:%[0-9]+]]:_(s64), [[UV5:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>)
+    ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[UV]](s64), [[UV1]](s64), [[LOAD1]](s64), [[UV5]](s64)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<3 x s64>) = G_LOAD %0 :: (load (<3 x s64>), align 8, addrspace 0)
     %2:_(<4 x s64>) = G_IMPLICIT_DEF
@@ -5597,119 +6335,133 @@ body: |
     ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[UV3]](s64)
     ; VI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v3s64_align1
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX9-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
-    ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
-    ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
-    ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
-    ; GFX9-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
-    ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
-    ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]]
-    ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
-    ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
-    ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]]
-    ; GFX9-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]]
-    ; GFX9-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32)
-    ; GFX9-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
-    ; GFX9-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
-    ; GFX9-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]]
-    ; GFX9-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
-    ; GFX9-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
-    ; GFX9-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]]
-    ; GFX9-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]]
-    ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32)
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
-    ; GFX9-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32)
-    ; GFX9-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]]
-    ; GFX9-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
-    ; GFX9-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16)
-    ; GFX9-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17)
-    ; GFX9-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]]
-    ; GFX9-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18)
-    ; GFX9-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19)
-    ; GFX9-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]]
-    ; GFX9-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[OR14]]
-    ; GFX9-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32)
-    ; GFX9-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20)
-    ; GFX9-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21)
-    ; GFX9-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[ZEXTLOAD15]]
-    ; GFX9-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22)
-    ; GFX9-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23)
-    ; GFX9-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]]
-    ; GFX9-NEXT: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]]
-    ; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32)
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
-    ; GFX9-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32)
-    ; GFX9-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]]
-    ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF
-    ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64), [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>)
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64), [[OR20]](s64), [[UV3]](s64)
-    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v3s64_align1
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
+    ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
+    ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
+    ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
+    ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
+    ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
+    ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
+    ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
+    ; GFX9PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; GFX9PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
+    ; GFX9PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
+    ; GFX9PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]]
+    ; GFX9PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
+    ; GFX9PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
+    ; GFX9PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]]
+    ; GFX9PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32)
+    ; GFX9PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]]
+    ; GFX9PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32)
+    ; GFX9PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
+    ; GFX9PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
+    ; GFX9PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]]
+    ; GFX9PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
+    ; GFX9PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
+    ; GFX9PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]]
+    ; GFX9PLUS-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32)
+    ; GFX9PLUS-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]]
+    ; GFX9PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32)
+    ; GFX9PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
+    ; GFX9PLUS-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32)
+    ; GFX9PLUS-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]]
+    ; GFX9PLUS-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; GFX9PLUS-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16)
+    ; GFX9PLUS-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17)
+    ; GFX9PLUS-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]]
+    ; GFX9PLUS-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18)
+    ; GFX9PLUS-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19)
+    ; GFX9PLUS-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]]
+    ; GFX9PLUS-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32)
+    ; GFX9PLUS-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[OR14]]
+    ; GFX9PLUS-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32)
+    ; GFX9PLUS-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20)
+    ; GFX9PLUS-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21)
+    ; GFX9PLUS-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[ZEXTLOAD15]]
+    ; GFX9PLUS-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22)
+    ; GFX9PLUS-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23)
+    ; GFX9PLUS-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]]
+    ; GFX9PLUS-NEXT: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32)
+    ; GFX9PLUS-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]]
+    ; GFX9PLUS-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32)
+    ; GFX9PLUS-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
+    ; GFX9PLUS-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32)
+    ; GFX9PLUS-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]]
+    ; GFX9PLUS-NEXT: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF
+    ; GFX9PLUS-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64), [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>)
+    ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64), [[OR20]](s64), [[UV3]](s64)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v3s64_align1
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1)
+    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p0) :: (load (s64) from unknown-address + 16, align 1)
+    ; GFX11PLUS-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[LOAD]](<2 x s64>)
+    ; GFX11PLUS-NEXT: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF
+    ; GFX11PLUS-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64), [[UV4:%[0-9]+]]:_(s64), [[UV5:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>)
+    ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[UV]](s64), [[UV1]](s64), [[LOAD1]](s64), [[UV5]](s64)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<3 x s64>) = G_LOAD %0 :: (load (<3 x s64>), align 1, addrspace 0)
     %2:_(<4 x s64>) = G_IMPLICIT_DEF
@@ -5783,16 +6535,27 @@ body: |
     ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
     ; VI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v4s64_align32
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 32)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[PTR_ADD]](p0) :: (load (<2 x s64>) from unknown-address + 16)
-    ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>)
-    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<4 x s64>)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v4s64_align32
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 32)
+    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[PTR_ADD]](p0) :: (load (<2 x s64>) from unknown-address + 16)
+    ; GFX9PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<4 x s64>)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v4s64_align32
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 32)
+    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[PTR_ADD]](p0) :: (load (<2 x s64>) from unknown-address + 16)
+    ; GFX11PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<4 x s64>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<4 x s64>) = G_LOAD %0 :: (load (<4 x s64>), align 32, addrspace 0)
     $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY %1
@@ -5864,16 +6627,27 @@ body: |
     ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
     ; VI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v4s64_align8
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 8)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[PTR_ADD]](p0) :: (load (<2 x s64>) from unknown-address + 16, align 8)
-    ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>)
-    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<4 x s64>)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v4s64_align8
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 8)
+    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[PTR_ADD]](p0) :: (load (<2 x s64>) from unknown-address + 16, align 8)
+    ; GFX9PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<4 x s64>)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v4s64_align8
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 8)
+    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[PTR_ADD]](p0) :: (load (<2 x s64>) from unknown-address + 16, align 8)
+    ; GFX11PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<4 x s64>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<4 x s64>) = G_LOAD %0 :: (load (<4 x s64>), align 8, addrspace 0)
     $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY %1
@@ -6145,152 +6919,163 @@ body: |
     ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64), [[MV2]](s64), [[MV3]](s64)
     ; VI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v4s64_align1
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX9-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
-    ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
-    ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
-    ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
-    ; GFX9-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
-    ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
-    ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]]
-    ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
-    ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
-    ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]]
-    ; GFX9-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]]
-    ; GFX9-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32)
-    ; GFX9-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
-    ; GFX9-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
-    ; GFX9-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]]
-    ; GFX9-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
-    ; GFX9-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
-    ; GFX9-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]]
-    ; GFX9-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]]
-    ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32)
-    ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
-    ; GFX9-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32)
-    ; GFX9-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]]
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64)
-    ; GFX9-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
-    ; GFX9-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16)
-    ; GFX9-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17)
-    ; GFX9-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]]
-    ; GFX9-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18)
-    ; GFX9-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19)
-    ; GFX9-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]]
-    ; GFX9-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[OR14]]
-    ; GFX9-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32)
-    ; GFX9-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20)
-    ; GFX9-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21)
-    ; GFX9-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[ZEXTLOAD15]]
-    ; GFX9-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22)
-    ; GFX9-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23)
-    ; GFX9-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]]
-    ; GFX9-NEXT: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]]
-    ; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32)
-    ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
-    ; GFX9-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32)
-    ; GFX9-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]]
-    ; GFX9-NEXT: [[PTR_ADD23:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C6]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD18:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD23]](p0) :: (load (s8) from unknown-address + 24)
-    ; GFX9-NEXT: [[PTR_ADD24:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD19:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD24]](p0) :: (load (s8) from unknown-address + 25)
-    ; GFX9-NEXT: [[SHL21:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD19]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR21:%[0-9]+]]:_(s32) = G_OR [[SHL21]], [[ZEXTLOAD18]]
-    ; GFX9-NEXT: [[PTR_ADD25:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C2]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD20:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD25]](p0) :: (load (s8) from unknown-address + 26)
-    ; GFX9-NEXT: [[PTR_ADD26:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD25]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD26]](p0) :: (load (s8) from unknown-address + 27)
-    ; GFX9-NEXT: [[SHL22:%[0-9]+]]:_(s32) = G_SHL [[LOAD6]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR22:%[0-9]+]]:_(s32) = G_OR [[SHL22]], [[ZEXTLOAD20]]
-    ; GFX9-NEXT: [[SHL23:%[0-9]+]]:_(s32) = G_SHL [[OR22]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR23:%[0-9]+]]:_(s32) = G_OR [[SHL23]], [[OR21]]
-    ; GFX9-NEXT: [[ZEXT3:%[0-9]+]]:_(s64) = G_ZEXT [[OR23]](s32)
-    ; GFX9-NEXT: [[PTR_ADD27:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C4]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD21:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD27]](p0) :: (load (s8) from unknown-address + 28)
-    ; GFX9-NEXT: [[PTR_ADD28:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD22:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD28]](p0) :: (load (s8) from unknown-address + 29)
-    ; GFX9-NEXT: [[SHL24:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD22]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR24:%[0-9]+]]:_(s32) = G_OR [[SHL24]], [[ZEXTLOAD21]]
-    ; GFX9-NEXT: [[PTR_ADD29:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C2]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD23:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD29]](p0) :: (load (s8) from unknown-address + 30)
-    ; GFX9-NEXT: [[PTR_ADD30:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD29]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD30]](p0) :: (load (s8) from unknown-address + 31)
-    ; GFX9-NEXT: [[SHL25:%[0-9]+]]:_(s32) = G_SHL [[LOAD7]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR25:%[0-9]+]]:_(s32) = G_OR [[SHL25]], [[ZEXTLOAD23]]
-    ; GFX9-NEXT: [[SHL26:%[0-9]+]]:_(s32) = G_SHL [[OR25]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR26:%[0-9]+]]:_(s32) = G_OR [[SHL26]], [[OR24]]
-    ; GFX9-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[OR26]](s32)
-    ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
-    ; GFX9-NEXT: [[SHL27:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT3]], [[COPY3]](s32)
-    ; GFX9-NEXT: [[OR27:%[0-9]+]]:_(s64) = G_OR [[SHL27]], [[ZEXT3]]
-    ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR20]](s64), [[OR27]](s64)
-    ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s64>), [[BUILD_VECTOR1]](<2 x s64>)
-    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<4 x s64>)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v4s64_align1
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
+    ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
+    ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
+    ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
+    ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
+    ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
+    ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
+    ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
+    ; GFX9PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; GFX9PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
+    ; GFX9PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
+    ; GFX9PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]]
+    ; GFX9PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
+    ; GFX9PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
+    ; GFX9PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]]
+    ; GFX9PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32)
+    ; GFX9PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]]
+    ; GFX9PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32)
+    ; GFX9PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
+    ; GFX9PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
+    ; GFX9PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]]
+    ; GFX9PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
+    ; GFX9PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
+    ; GFX9PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]]
+    ; GFX9PLUS-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32)
+    ; GFX9PLUS-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]]
+    ; GFX9PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32)
+    ; GFX9PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
+    ; GFX9PLUS-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32)
+    ; GFX9PLUS-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]]
+    ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64)
+    ; GFX9PLUS-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; GFX9PLUS-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16)
+    ; GFX9PLUS-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17)
+    ; GFX9PLUS-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]]
+    ; GFX9PLUS-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18)
+    ; GFX9PLUS-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19)
+    ; GFX9PLUS-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]]
+    ; GFX9PLUS-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32)
+    ; GFX9PLUS-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[OR14]]
+    ; GFX9PLUS-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32)
+    ; GFX9PLUS-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20)
+    ; GFX9PLUS-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21)
+    ; GFX9PLUS-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[ZEXTLOAD15]]
+    ; GFX9PLUS-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22)
+    ; GFX9PLUS-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23)
+    ; GFX9PLUS-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]]
+    ; GFX9PLUS-NEXT: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32)
+    ; GFX9PLUS-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]]
+    ; GFX9PLUS-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32)
+    ; GFX9PLUS-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
+    ; GFX9PLUS-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32)
+    ; GFX9PLUS-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]]
+    ; GFX9PLUS-NEXT: [[PTR_ADD23:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C6]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD18:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD23]](p0) :: (load (s8) from unknown-address + 24)
+    ; GFX9PLUS-NEXT: [[PTR_ADD24:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD19:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD24]](p0) :: (load (s8) from unknown-address + 25)
+    ; GFX9PLUS-NEXT: [[SHL21:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD19]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR21:%[0-9]+]]:_(s32) = G_OR [[SHL21]], [[ZEXTLOAD18]]
+    ; GFX9PLUS-NEXT: [[PTR_ADD25:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD20:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD25]](p0) :: (load (s8) from unknown-address + 26)
+    ; GFX9PLUS-NEXT: [[PTR_ADD26:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD25]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD26]](p0) :: (load (s8) from unknown-address + 27)
+    ; GFX9PLUS-NEXT: [[SHL22:%[0-9]+]]:_(s32) = G_SHL [[LOAD6]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR22:%[0-9]+]]:_(s32) = G_OR [[SHL22]], [[ZEXTLOAD20]]
+    ; GFX9PLUS-NEXT: [[SHL23:%[0-9]+]]:_(s32) = G_SHL [[OR22]], [[C3]](s32)
+    ; GFX9PLUS-NEXT: [[OR23:%[0-9]+]]:_(s32) = G_OR [[SHL23]], [[OR21]]
+    ; GFX9PLUS-NEXT: [[ZEXT3:%[0-9]+]]:_(s64) = G_ZEXT [[OR23]](s32)
+    ; GFX9PLUS-NEXT: [[PTR_ADD27:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C4]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD21:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD27]](p0) :: (load (s8) from unknown-address + 28)
+    ; GFX9PLUS-NEXT: [[PTR_ADD28:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD22:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD28]](p0) :: (load (s8) from unknown-address + 29)
+    ; GFX9PLUS-NEXT: [[SHL24:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD22]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR24:%[0-9]+]]:_(s32) = G_OR [[SHL24]], [[ZEXTLOAD21]]
+    ; GFX9PLUS-NEXT: [[PTR_ADD29:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD23:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD29]](p0) :: (load (s8) from unknown-address + 30)
+    ; GFX9PLUS-NEXT: [[PTR_ADD30:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD29]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD30]](p0) :: (load (s8) from unknown-address + 31)
+    ; GFX9PLUS-NEXT: [[SHL25:%[0-9]+]]:_(s32) = G_SHL [[LOAD7]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR25:%[0-9]+]]:_(s32) = G_OR [[SHL25]], [[ZEXTLOAD23]]
+    ; GFX9PLUS-NEXT: [[SHL26:%[0-9]+]]:_(s32) = G_SHL [[OR25]], [[C3]](s32)
+    ; GFX9PLUS-NEXT: [[OR26:%[0-9]+]]:_(s32) = G_OR [[SHL26]], [[OR24]]
+    ; GFX9PLUS-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[OR26]](s32)
+    ; GFX9PLUS-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
+    ; GFX9PLUS-NEXT: [[SHL27:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT3]], [[COPY3]](s32)
+    ; GFX9PLUS-NEXT: [[OR27:%[0-9]+]]:_(s64) = G_OR [[SHL27]], [[ZEXT3]]
+    ; GFX9PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR20]](s64), [[OR27]](s64)
+    ; GFX9PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s64>), [[BUILD_VECTOR1]](<2 x s64>)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<4 x s64>)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v4s64_align1
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1)
+    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[PTR_ADD]](p0) :: (load (<2 x s64>) from unknown-address + 16, align 1)
+    ; GFX11PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<4 x s64>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<4 x s64>) = G_LOAD %0 :: (load (<4 x s64>), align 1, addrspace 0)
     $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY %1
@@ -6362,17 +7147,29 @@ body: |
     ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s128>) = G_BITCAST [[BUILD_VECTOR]](<8 x s32>)
     ; VI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BITCAST]](<2 x s128>)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v2s128_align32
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 32)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD]](p0) :: (load (<4 x s32>) from unknown-address + 16)
-    ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>)
-    ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s128>) = G_BITCAST [[CONCAT_VECTORS]](<8 x s32>)
-    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BITCAST]](<2 x s128>)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v2s128_align32
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 32)
+    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD]](p0) :: (load (<4 x s32>) from unknown-address + 16)
+    ; GFX9PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>)
+    ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s128>) = G_BITCAST [[CONCAT_VECTORS]](<8 x s32>)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BITCAST]](<2 x s128>)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v2s128_align32
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 32)
+    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD]](p0) :: (load (<4 x s32>) from unknown-address + 16)
+    ; GFX11PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>)
+    ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s128>) = G_BITCAST [[CONCAT_VECTORS]](<8 x s32>)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BITCAST]](<2 x s128>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<2 x s128>) = G_LOAD %0 :: (load (<2 x s128>), align 32, addrspace 0)
     $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY %1
@@ -6420,13 +7217,21 @@ body: |
     ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
     ; VI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v2p1_align16
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>))
-    ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>)
-    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v2p1_align16
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>))
+    ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v2p1_align16
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>))
+    ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<2 x p1>) = G_LOAD %0 :: (load (<2 x p1>), align 16, addrspace 0)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -6474,13 +7279,21 @@ body: |
     ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
     ; VI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v2p1_align8
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 8)
-    ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>)
-    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v2p1_align8
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 8)
+    ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v2p1_align8
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 8)
+    ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<2 x p1>) = G_LOAD %0 :: (load (<2 x p1>), align 8, addrspace 0)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -6528,13 +7341,21 @@ body: |
     ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
     ; VI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v2p1_align4
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 4)
-    ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>)
-    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v2p1_align4
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 4)
+    ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v2p1_align4
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 4)
+    ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<2 x p1>) = G_LOAD %0 :: (load (<2 x p1>), align 4, addrspace 0)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -6686,75 +7507,83 @@ body: |
     ; VI-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
     ; VI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v2p1_align1
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
-    ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
-    ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
-    ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
-    ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
-    ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
-    ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
-    ; GFX9-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12
-    ; GFX9-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
-    ; GFX9-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
-    ; GFX9-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
-    ; GFX9-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
-    ; GFX9-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
-    ; GFX9-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
-    ; GFX9-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
-    ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
-    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v2p1_align1
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
+    ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
+    ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
+    ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
+    ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; GFX9PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
+    ; GFX9PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
+    ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
+    ; GFX9PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
+    ; GFX9PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
+    ; GFX9PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
+    ; GFX9PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
+    ; GFX9PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
+    ; GFX9PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12
+    ; GFX9PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
+    ; GFX9PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
+    ; GFX9PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
+    ; GFX9PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
+    ; GFX9PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
+    ; GFX9PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
+    ; GFX9PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
+    ; GFX9PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
+    ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
+    ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v2p1_align1
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 1)
+    ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<2 x p1>) = G_LOAD %0 :: (load (<2 x p1>), align 1, addrspace 0)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -6788,12 +7617,19 @@ body: |
     ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[LOAD]](p3), [[LOAD1]](p3)
     ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v2p3_align8
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p0) :: (load (<2 x p3>))
-    ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v2p3_align8
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p0) :: (load (<2 x p3>))
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v2p3_align8
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p0) :: (load (<2 x p3>))
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<2 x p3>) = G_LOAD %0 :: (load (<2 x p3>), align 8, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -6827,12 +7663,19 @@ body: |
     ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[LOAD]](p3), [[LOAD1]](p3)
     ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v2p3_align4
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p0) :: (load (<2 x p3>), align 4)
-    ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v2p3_align4
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p0) :: (load (<2 x p3>), align 4)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v2p3_align4
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p0) :: (load (<2 x p3>), align 4)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<2 x p3>) = G_LOAD %0 :: (load (<2 x p3>), align 4, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -6926,46 +7769,53 @@ body: |
     ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3)
     ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>)
     ;
-    ; GFX9-LABEL: name: test_load_flat_v2p3_align1
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX9-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32)
-    ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX9-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32)
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3)
-    ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>)
+    ; GFX9PLUS-LABEL: name: test_load_flat_v2p3_align1
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; GFX9PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32)
+    ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
+    ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
+    ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
+    ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
+    ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; GFX9PLUS-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32)
+    ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_v2p3_align1
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p0) :: (load (<2 x p3>), align 1)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<2 x p3>) = G_LOAD %0 :: (load (<2 x p3>), align 1, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -6991,12 +7841,19 @@ body: |
     ; VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s8), align 4)
     ; VI-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     ;
-    ; GFX9-LABEL: name: test_ext_load_flat_s32_from_1_align4
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s8), align 4)
-    ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](s32)
+    ; GFX9PLUS-LABEL: name: test_ext_load_flat_s32_from_1_align4
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s8), align 4)
+    ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
+    ;
+    ; GFX11PLUS-LABEL: name: test_ext_load_flat_s32_from_1_align4
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s8), align 4)
+    ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s32) = G_LOAD %0 :: (load (s8), align 4, addrspace 0)
     $vgpr0 = COPY %1
@@ -7022,12 +7879,19 @@ body: |
     ; VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 4)
     ; VI-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     ;
-    ; GFX9-LABEL: name: test_ext_load_flat_s32_from_2_align4
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 4)
-    ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](s32)
+    ; GFX9PLUS-LABEL: name: test_ext_load_flat_s32_from_2_align4
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 4)
+    ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
+    ;
+    ; GFX11PLUS-LABEL: name: test_ext_load_flat_s32_from_2_align4
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 4)
+    ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s32) = G_LOAD %0 :: (load (s16), align 4, addrspace 0)
     $vgpr0 = COPY %1
@@ -7056,13 +7920,21 @@ body: |
     ; VI-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s32)
     ; VI-NEXT: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64)
     ;
-    ; GFX9-LABEL: name: test_ext_load_flat_s64_from_1_align4
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s8), align 4)
-    ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s32)
-    ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64)
+    ; GFX9PLUS-LABEL: name: test_ext_load_flat_s64_from_1_align4
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s8), align 4)
+    ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s32)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64)
+    ;
+    ; GFX11PLUS-LABEL: name: test_ext_load_flat_s64_from_1_align4
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s8), align 4)
+    ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s32)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s64) = G_LOAD %0 :: (load (s8), align 4, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -7090,13 +7962,21 @@ body: |
     ; VI-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s32)
     ; VI-NEXT: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64)
     ;
-    ; GFX9-LABEL: name: test_ext_load_flat_s64_from_2_align4
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 4)
-    ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s32)
-    ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64)
+    ; GFX9PLUS-LABEL: name: test_ext_load_flat_s64_from_2_align4
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 4)
+    ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s32)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64)
+    ;
+    ; GFX11PLUS-LABEL: name: test_ext_load_flat_s64_from_2_align4
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 4)
+    ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s32)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s64) = G_LOAD %0 :: (load (s16), align 4, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -7124,13 +8004,21 @@ body: |
     ; VI-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s32)
     ; VI-NEXT: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64)
     ;
-    ; GFX9-LABEL: name: test_ext_load_flat_s64_from_4_align4
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32))
-    ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s32)
-    ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64)
+    ; GFX9PLUS-LABEL: name: test_ext_load_flat_s64_from_4_align4
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32))
+    ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s32)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64)
+    ;
+    ; GFX11PLUS-LABEL: name: test_ext_load_flat_s64_from_4_align4
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32))
+    ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s32)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s64) = G_LOAD %0 :: (load (s32), align 4, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -7164,16 +8052,27 @@ body: |
     ; VI-NEXT: [[MV1:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[MV]](s64), [[DEF1]](s64)
     ; VI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV1]](s128)
     ;
-    ; GFX9-LABEL: name: test_ext_load_flat_s128_from_4_align4
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32))
-    ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-    ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD]](s32), [[DEF]](s32)
-    ; GFX9-NEXT: [[DEF1:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF
-    ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[MV]](s64), [[DEF1]](s64)
-    ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV1]](s128)
+    ; GFX9PLUS-LABEL: name: test_ext_load_flat_s128_from_4_align4
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32))
+    ; GFX9PLUS-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+    ; GFX9PLUS-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD]](s32), [[DEF]](s32)
+    ; GFX9PLUS-NEXT: [[DEF1:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF
+    ; GFX9PLUS-NEXT: [[MV1:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[MV]](s64), [[DEF1]](s64)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV1]](s128)
+    ;
+    ; GFX11PLUS-LABEL: name: test_ext_load_flat_s128_from_4_align4
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32))
+    ; GFX11PLUS-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+    ; GFX11PLUS-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD]](s32), [[DEF]](s32)
+    ; GFX11PLUS-NEXT: [[DEF1:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF
+    ; GFX11PLUS-NEXT: [[MV1:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[MV]](s64), [[DEF1]](s64)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV1]](s128)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s128) = G_LOAD %0 :: (load (s32), align 4, addrspace 0)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -7201,13 +8100,21 @@ body: |
     ; VI-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s32)
     ; VI-NEXT: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64)
     ;
-    ; GFX9-LABEL: name: test_ext_load_flat_s64_from_2_align2
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 4)
-    ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s32)
-    ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64)
+    ; GFX9PLUS-LABEL: name: test_ext_load_flat_s64_from_2_align2
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 4)
+    ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s32)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64)
+    ;
+    ; GFX11PLUS-LABEL: name: test_ext_load_flat_s64_from_2_align2
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 4)
+    ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s32)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s64) = G_LOAD %0 :: (load (s16), align 4, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -7235,13 +8142,21 @@ body: |
     ; VI-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s32)
     ; VI-NEXT: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64)
     ;
-    ; GFX9-LABEL: name: test_ext_load_flat_s64_from_1_align1
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s8), align 4)
-    ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s32)
-    ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64)
+    ; GFX9PLUS-LABEL: name: test_ext_load_flat_s64_from_1_align1
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s8), align 4)
+    ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s32)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64)
+    ;
+    ; GFX11PLUS-LABEL: name: test_ext_load_flat_s64_from_1_align1
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s8), align 4)
+    ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LOAD]](s32)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[ANYEXT]](s64)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s64) = G_LOAD %0 :: (load (s8), align 4, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -7268,12 +8183,19 @@ body: |
     ; VI-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 536870912)
     ; VI-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     ;
-    ; GFX9-LABEL: name: test_load_flat_s32_align536870912
-    ; GFX9: liveins: $vgpr0_vgpr1
-    ; GFX9-NEXT: {{  $}}
-    ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 536870912)
-    ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](s32)
+    ; GFX9PLUS-LABEL: name: test_load_flat_s32_align536870912
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{  $}}
+    ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 536870912)
+    ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
+    ;
+    ; GFX11PLUS-LABEL: name: test_load_flat_s32_align536870912
+    ; GFX11PLUS: liveins: $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: {{  $}}
+    ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 536870912)
+    ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s32) = G_LOAD %0 :: (load (s16), align 536870912)
     $vgpr0 = COPY %1
diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
index ea10547da6ab7f..fd2d0bc901f3fc 100644
--- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -703,12 +703,12 @@ define <2 x i16> @chain_hi_to_lo_private_other_dep(ptr addrspace(5) %ptr) {
 ; FLATSCR-LABEL: chain_hi_to_lo_private_other_dep:
 ; FLATSCR:       ; %bb.0: ; %bb
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; FLATSCR-NEXT:    scratch_load_short_d16_hi v1, v0, off
+; FLATSCR-NEXT:    scratch_load_dword v0, v0, off
+; FLATSCR-NEXT:    s_mov_b32 s0, 0x7060302
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
 ; FLATSCR-NEXT:    v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0]
-; FLATSCR-NEXT:    scratch_load_short_d16 v1, v0, off offset:2
-; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; FLATSCR-NEXT:    v_mov_b32_e32 v0, v1
+; FLATSCR-NEXT:    v_perm_b32 v0, v1, v0, s0
 ; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10_DEFAULT-LABEL: chain_hi_to_lo_private_other_dep:
@@ -725,23 +725,22 @@ define <2 x i16> @chain_hi_to_lo_private_other_dep(ptr addrspace(5) %ptr) {
 ; FLATSCR_GFX10-LABEL: chain_hi_to_lo_private_other_dep:
 ; FLATSCR_GFX10:       ; %bb.0: ; %bb
 ; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; FLATSCR_GFX10-NEXT:    scratch_load_short_d16_hi v1, v0, off
+; FLATSCR_GFX10-NEXT:    scratch_load_dword v0, v0, off
 ; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR_GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
 ; FLATSCR_GFX10-NEXT:    v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0]
-; FLATSCR_GFX10-NEXT:    scratch_load_short_d16 v1, v0, off offset:2
-; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0)
-; FLATSCR_GFX10-NEXT:    v_mov_b32_e32 v0, v1
+; FLATSCR_GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
 ; FLATSCR_GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: chain_hi_to_lo_private_other_dep:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    scratch_load_d16_hi_b16 v1, v0, off
+; GFX11-NEXT:    scratch_load_b32 v0, v0, off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0]
-; GFX11-NEXT:    scratch_load_d16_b16 v1, v0, off offset:2
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v0, v1
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %gep_lo = getelementptr inbounds i16, ptr addrspace(5) %ptr, i64 1
diff --git a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll
index 0ad53083d0ff3f..12593e3760fd3e 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll
@@ -123,10 +123,8 @@ define amdgpu_kernel void @zextload_flat_i16(ptr addrspace(1) noalias %out, ptr
 }
 
 ; GCN-LABEL: flat_scratch_unaligned_load:
-; GCN: flat_load_{{ubyte|u8}}
-; GCN: flat_load_{{ubyte|u8}}
-; GCN: flat_load_{{ubyte|u8}}
-; GCN: flat_load_{{ubyte|u8}}
+; GFX9: flat_load_dword
+; GFX10PLUS: flat_load_{{dword|b32}}
 define amdgpu_kernel void @flat_scratch_unaligned_load() {
   %scratch = alloca i32, addrspace(5)
   %fptr = addrspacecast ptr addrspace(5) %scratch to ptr
@@ -136,10 +134,8 @@ define amdgpu_kernel void @flat_scratch_unaligned_load() {
 }
 
 ; GCN-LABEL: flat_scratch_unaligned_store:
-; GCN: flat_store_{{byte|b8}}
-; GCN: flat_store_{{byte|b8}}
-; GCN: flat_store_{{byte|b8}}
-; GCN: flat_store_{{byte|b8}}
+; GFX9: flat_store_dword
+; GFX10PLUS: flat_store_{{dword|b32}}
 define amdgpu_kernel void @flat_scratch_unaligned_store() {
   %scratch = alloca i32, addrspace(5)
   %fptr = addrspacecast ptr addrspace(5) %scratch to ptr
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll
index 1dd18b4228fe5e..9d43efbdf07b1f 100644
--- a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll
@@ -16,47 +16,18 @@ define void @issue63986(i64 %0, i64 %idxprom) {
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    v_mov_b32_e32 v9, s7
 ; CHECK-NEXT:    v_mov_b32_e32 v8, s6
-; CHECK-NEXT:    flat_load_ubyte v10, v[8:9] offset:5
-; CHECK-NEXT:    flat_load_ubyte v11, v[8:9] offset:6
-; CHECK-NEXT:    flat_load_ubyte v12, v[8:9] offset:7
-; CHECK-NEXT:    flat_load_ubyte v13, v[8:9] offset:3
-; CHECK-NEXT:    flat_load_ubyte v14, v[8:9] offset:2
-; CHECK-NEXT:    flat_load_ubyte v15, v[8:9] offset:1
-; CHECK-NEXT:    flat_load_ubyte v16, v[8:9]
-; CHECK-NEXT:    flat_load_ubyte v17, v[8:9] offset:4
-; CHECK-NEXT:    flat_load_ubyte v18, v[8:9] offset:13
-; CHECK-NEXT:    flat_load_ubyte v19, v[8:9] offset:14
-; CHECK-NEXT:    flat_load_ubyte v20, v[8:9] offset:15
-; CHECK-NEXT:    flat_load_ubyte v21, v[8:9] offset:11
-; CHECK-NEXT:    flat_load_ubyte v22, v[8:9] offset:10
-; CHECK-NEXT:    flat_load_ubyte v23, v[8:9] offset:9
-; CHECK-NEXT:    flat_load_ubyte v24, v[8:9] offset:8
-; CHECK-NEXT:    flat_load_ubyte v25, v[8:9] offset:12
+; CHECK-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
 ; CHECK-NEXT:    s_add_u32 s4, s4, 1
 ; CHECK-NEXT:    s_addc_u32 s5, s5, 0
-; CHECK-NEXT:    v_add_co_u32_e32 v8, vcc, s6, v6
+; CHECK-NEXT:    v_mov_b32_e32 v13, s7
+; CHECK-NEXT:    v_add_co_u32_e32 v12, vcc, s6, v6
 ; CHECK-NEXT:    v_cmp_ge_u64_e64 s[8:9], s[4:5], 2
-; CHECK-NEXT:    v_addc_co_u32_e32 v9, vcc, v9, v7, vcc
+; CHECK-NEXT:    v_addc_co_u32_e32 v13, vcc, v13, v7, vcc
 ; CHECK-NEXT:    s_add_u32 s6, s6, 16
 ; CHECK-NEXT:    s_addc_u32 s7, s7, 0
 ; CHECK-NEXT:    s_and_b64 vcc, exec, s[8:9]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[8:9], v13 offset:3
-; CHECK-NEXT:    flat_store_byte v[8:9], v14 offset:2
-; CHECK-NEXT:    flat_store_byte v[8:9], v15 offset:1
-; CHECK-NEXT:    flat_store_byte v[8:9], v16
-; CHECK-NEXT:    flat_store_byte v[8:9], v12 offset:7
-; CHECK-NEXT:    flat_store_byte v[8:9], v11 offset:6
-; CHECK-NEXT:    flat_store_byte v[8:9], v10 offset:5
-; CHECK-NEXT:    flat_store_byte v[8:9], v17 offset:4
-; CHECK-NEXT:    flat_store_byte v[8:9], v21 offset:11
-; CHECK-NEXT:    flat_store_byte v[8:9], v22 offset:10
-; CHECK-NEXT:    flat_store_byte v[8:9], v23 offset:9
-; CHECK-NEXT:    flat_store_byte v[8:9], v24 offset:8
-; CHECK-NEXT:    flat_store_byte v[8:9], v20 offset:15
-; CHECK-NEXT:    flat_store_byte v[8:9], v19 offset:14
-; CHECK-NEXT:    flat_store_byte v[8:9], v18 offset:13
-; CHECK-NEXT:    flat_store_byte v[8:9], v25 offset:12
+; CHECK-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
 ; CHECK-NEXT:    s_cbranch_vccz .LBB0_2
 ; CHECK-NEXT:  ; %bb.3: ; %loop-memcpy-residual-header
 ; CHECK-NEXT:    s_mov_b32 s4, 0
@@ -128,47 +99,18 @@ define void @issue63986(i64 %0, i64 %idxprom) {
 ; CHECK-NEXT:    ; => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    v_mov_b32_e32 v10, s10
 ; CHECK-NEXT:    v_mov_b32_e32 v11, s11
-; CHECK-NEXT:    flat_load_ubyte v12, v[10:11] offset:5
-; CHECK-NEXT:    flat_load_ubyte v13, v[10:11] offset:6
-; CHECK-NEXT:    flat_load_ubyte v14, v[10:11] offset:7
-; CHECK-NEXT:    flat_load_ubyte v15, v[10:11] offset:3
-; CHECK-NEXT:    flat_load_ubyte v16, v[10:11] offset:2
-; CHECK-NEXT:    flat_load_ubyte v17, v[10:11] offset:1
-; CHECK-NEXT:    flat_load_ubyte v18, v[10:11]
-; CHECK-NEXT:    flat_load_ubyte v19, v[10:11] offset:4
-; CHECK-NEXT:    flat_load_ubyte v20, v[10:11] offset:13
-; CHECK-NEXT:    flat_load_ubyte v21, v[10:11] offset:14
-; CHECK-NEXT:    flat_load_ubyte v22, v[10:11] offset:15
-; CHECK-NEXT:    flat_load_ubyte v23, v[10:11] offset:11
-; CHECK-NEXT:    flat_load_ubyte v24, v[10:11] offset:10
-; CHECK-NEXT:    flat_load_ubyte v25, v[10:11] offset:9
-; CHECK-NEXT:    flat_load_ubyte v26, v[10:11] offset:8
-; CHECK-NEXT:    flat_load_ubyte v27, v[10:11] offset:12
+; CHECK-NEXT:    flat_load_dwordx4 v[10:13], v[10:11]
+; CHECK-NEXT:    v_mov_b32_e32 v15, s11
 ; CHECK-NEXT:    s_add_u32 s14, s14, 1
-; CHECK-NEXT:    v_add_co_u32_e32 v10, vcc, s10, v2
-; CHECK-NEXT:    v_addc_co_u32_e32 v11, vcc, v11, v3, vcc
+; CHECK-NEXT:    v_add_co_u32_e32 v14, vcc, s10, v2
+; CHECK-NEXT:    v_addc_co_u32_e32 v15, vcc, v15, v3, vcc
 ; CHECK-NEXT:    s_addc_u32 s15, s15, 0
 ; CHECK-NEXT:    s_add_u32 s10, s10, 16
 ; CHECK-NEXT:    v_cmp_ge_u64_e32 vcc, s[14:15], v[4:5]
 ; CHECK-NEXT:    s_addc_u32 s11, s11, 0
 ; CHECK-NEXT:    s_or_b64 s[12:13], vcc, s[12:13]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[10:11], v15 offset:3
-; CHECK-NEXT:    flat_store_byte v[10:11], v16 offset:2
-; CHECK-NEXT:    flat_store_byte v[10:11], v17 offset:1
-; CHECK-NEXT:    flat_store_byte v[10:11], v18
-; CHECK-NEXT:    flat_store_byte v[10:11], v14 offset:7
-; CHECK-NEXT:    flat_store_byte v[10:11], v13 offset:6
-; CHECK-NEXT:    flat_store_byte v[10:11], v12 offset:5
-; CHECK-NEXT:    flat_store_byte v[10:11], v19 offset:4
-; CHECK-NEXT:    flat_store_byte v[10:11], v23 offset:11
-; CHECK-NEXT:    flat_store_byte v[10:11], v24 offset:10
-; CHECK-NEXT:    flat_store_byte v[10:11], v25 offset:9
-; CHECK-NEXT:    flat_store_byte v[10:11], v26 offset:8
-; CHECK-NEXT:    flat_store_byte v[10:11], v22 offset:15
-; CHECK-NEXT:    flat_store_byte v[10:11], v21 offset:14
-; CHECK-NEXT:    flat_store_byte v[10:11], v20 offset:13
-; CHECK-NEXT:    flat_store_byte v[10:11], v27 offset:12
+; CHECK-NEXT:    flat_store_dwordx4 v[14:15], v[10:13]
 ; CHECK-NEXT:    s_andn2_b64 exec, exec, s[12:13]
 ; CHECK-NEXT:    s_cbranch_execnz .LBB0_14
 ; CHECK-NEXT:  .LBB0_15: ; %Flow20
@@ -251,23 +193,11 @@ define void @issue63986_reduced_expanded(i64 %idxprom) {
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:  .LBB1_8: ; %post-loop-memcpy-expansion
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0
+; CHECK-NEXT:    v_mov_b32_e32 v3, v2
+; CHECK-NEXT:    v_mov_b32_e32 v4, v2
+; CHECK-NEXT:    v_mov_b32_e32 v5, v2
 ; CHECK-NEXT:    s_and_b64 vcc, exec, 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:3
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:2
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:1
-; CHECK-NEXT:    flat_store_byte v[0:1], v2
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:7
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:6
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:5
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:4
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:11
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:10
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:9
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:8
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:15
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:14
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:13
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:12
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:  .LBB1_9: ; %loop-memcpy-expansion2
 ; CHECK-NEXT:    s_mov_b64 vcc, vcc
 ; CHECK-NEXT:    s_cbranch_vccz .LBB1_9
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
index 0a76e169e9c385..8c28fac0d839c2 100644
--- a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
@@ -10,108 +10,21 @@ define amdgpu_kernel void @memcpy_p0_p0_minsize(ptr %dest, ptr readonly %src) #0
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-NEXT:    v_mov_b32_e32 v1, s3
-; CHECK-NEXT:    flat_load_ubyte v4, v[0:1]
-; CHECK-NEXT:    flat_load_ubyte v5, v[0:1] offset:1
-; CHECK-NEXT:    flat_load_ubyte v6, v[0:1] offset:2
-; CHECK-NEXT:    flat_load_ubyte v7, v[0:1] offset:3
-; CHECK-NEXT:    flat_load_ubyte v8, v[0:1] offset:4
-; CHECK-NEXT:    flat_load_ubyte v9, v[0:1] offset:5
-; CHECK-NEXT:    flat_load_ubyte v10, v[0:1] offset:6
-; CHECK-NEXT:    flat_load_ubyte v11, v[0:1] offset:7
-; CHECK-NEXT:    flat_load_ubyte v12, v[0:1] offset:8
-; CHECK-NEXT:    flat_load_ubyte v13, v[0:1] offset:9
-; CHECK-NEXT:    flat_load_ubyte v14, v[0:1] offset:10
-; CHECK-NEXT:    flat_load_ubyte v15, v[0:1] offset:11
-; CHECK-NEXT:    flat_load_ubyte v16, v[0:1] offset:12
-; CHECK-NEXT:    flat_load_ubyte v17, v[0:1] offset:13
-; CHECK-NEXT:    flat_load_ubyte v18, v[0:1] offset:14
-; CHECK-NEXT:    v_mov_b32_e32 v3, s1
-; CHECK-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-NEXT:    v_mov_b32_e32 v12, s3
+; CHECK-NEXT:    v_mov_b32_e32 v11, s2
+; CHECK-NEXT:    flat_load_ubyte v13, v[11:12] offset:46
+; CHECK-NEXT:    flat_load_ushort v14, v[11:12] offset:44
+; CHECK-NEXT:    flat_load_dwordx3 v[8:10], v[11:12] offset:32
+; CHECK-NEXT:    flat_load_dwordx4 v[0:3], v[11:12] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[11:12]
+; CHECK-NEXT:    v_mov_b32_e32 v12, s1
+; CHECK-NEXT:    v_mov_b32_e32 v11, s0
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[2:3], v4
-; CHECK-NEXT:    flat_store_byte v[2:3], v5 offset:1
-; CHECK-NEXT:    flat_store_byte v[2:3], v6 offset:2
-; CHECK-NEXT:    flat_store_byte v[2:3], v7 offset:3
-; CHECK-NEXT:    flat_store_byte v[2:3], v8 offset:4
-; CHECK-NEXT:    flat_store_byte v[2:3], v9 offset:5
-; CHECK-NEXT:    flat_store_byte v[2:3], v10 offset:6
-; CHECK-NEXT:    flat_store_byte v[2:3], v11 offset:7
-; CHECK-NEXT:    flat_store_byte v[2:3], v12 offset:8
-; CHECK-NEXT:    flat_store_byte v[2:3], v13 offset:9
-; CHECK-NEXT:    flat_store_byte v[2:3], v14 offset:10
-; CHECK-NEXT:    flat_store_byte v[2:3], v15 offset:11
-; CHECK-NEXT:    flat_store_byte v[2:3], v16 offset:12
-; CHECK-NEXT:    flat_store_byte v[2:3], v17 offset:13
-; CHECK-NEXT:    flat_store_byte v[2:3], v18 offset:14
-; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:30
-; CHECK-NEXT:    flat_load_ubyte v5, v[0:1] offset:29
-; CHECK-NEXT:    flat_load_ubyte v6, v[0:1] offset:28
-; CHECK-NEXT:    flat_load_ubyte v7, v[0:1] offset:27
-; CHECK-NEXT:    flat_load_ubyte v8, v[0:1] offset:26
-; CHECK-NEXT:    flat_load_ubyte v9, v[0:1] offset:25
-; CHECK-NEXT:    flat_load_ubyte v10, v[0:1] offset:24
-; CHECK-NEXT:    flat_load_ubyte v11, v[0:1] offset:23
-; CHECK-NEXT:    flat_load_ubyte v12, v[0:1] offset:22
-; CHECK-NEXT:    flat_load_ubyte v13, v[0:1] offset:21
-; CHECK-NEXT:    flat_load_ubyte v14, v[0:1] offset:20
-; CHECK-NEXT:    flat_load_ubyte v15, v[0:1] offset:19
-; CHECK-NEXT:    flat_load_ubyte v16, v[0:1] offset:18
-; CHECK-NEXT:    flat_load_ubyte v17, v[0:1] offset:17
-; CHECK-NEXT:    flat_load_ubyte v18, v[0:1] offset:16
-; CHECK-NEXT:    flat_load_ubyte v19, v[0:1] offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:30
-; CHECK-NEXT:    flat_store_byte v[2:3], v5 offset:29
-; CHECK-NEXT:    flat_store_byte v[2:3], v6 offset:28
-; CHECK-NEXT:    flat_store_byte v[2:3], v7 offset:27
-; CHECK-NEXT:    flat_store_byte v[2:3], v8 offset:26
-; CHECK-NEXT:    flat_store_byte v[2:3], v9 offset:25
-; CHECK-NEXT:    flat_store_byte v[2:3], v10 offset:24
-; CHECK-NEXT:    flat_store_byte v[2:3], v11 offset:23
-; CHECK-NEXT:    flat_store_byte v[2:3], v12 offset:22
-; CHECK-NEXT:    flat_store_byte v[2:3], v13 offset:21
-; CHECK-NEXT:    flat_store_byte v[2:3], v14 offset:20
-; CHECK-NEXT:    flat_store_byte v[2:3], v15 offset:19
-; CHECK-NEXT:    flat_store_byte v[2:3], v16 offset:18
-; CHECK-NEXT:    flat_store_byte v[2:3], v17 offset:17
-; CHECK-NEXT:    flat_store_byte v[2:3], v18 offset:16
-; CHECK-NEXT:    flat_store_byte v[2:3], v19 offset:15
-; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:46
-; CHECK-NEXT:    flat_load_ubyte v5, v[0:1] offset:45
-; CHECK-NEXT:    flat_load_ubyte v6, v[0:1] offset:44
-; CHECK-NEXT:    flat_load_ubyte v7, v[0:1] offset:43
-; CHECK-NEXT:    flat_load_ubyte v8, v[0:1] offset:42
-; CHECK-NEXT:    flat_load_ubyte v9, v[0:1] offset:41
-; CHECK-NEXT:    flat_load_ubyte v10, v[0:1] offset:40
-; CHECK-NEXT:    flat_load_ubyte v11, v[0:1] offset:39
-; CHECK-NEXT:    flat_load_ubyte v12, v[0:1] offset:38
-; CHECK-NEXT:    flat_load_ubyte v13, v[0:1] offset:37
-; CHECK-NEXT:    flat_load_ubyte v14, v[0:1] offset:36
-; CHECK-NEXT:    flat_load_ubyte v15, v[0:1] offset:35
-; CHECK-NEXT:    flat_load_ubyte v16, v[0:1] offset:34
-; CHECK-NEXT:    flat_load_ubyte v17, v[0:1] offset:33
-; CHECK-NEXT:    flat_load_ubyte v18, v[0:1] offset:32
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_load_ubyte v0, v[0:1] offset:31
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:46
-; CHECK-NEXT:    flat_store_byte v[2:3], v5 offset:45
-; CHECK-NEXT:    flat_store_byte v[2:3], v6 offset:44
-; CHECK-NEXT:    flat_store_byte v[2:3], v7 offset:43
-; CHECK-NEXT:    flat_store_byte v[2:3], v8 offset:42
-; CHECK-NEXT:    flat_store_byte v[2:3], v9 offset:41
-; CHECK-NEXT:    flat_store_byte v[2:3], v10 offset:40
-; CHECK-NEXT:    flat_store_byte v[2:3], v11 offset:39
-; CHECK-NEXT:    flat_store_byte v[2:3], v12 offset:38
-; CHECK-NEXT:    flat_store_byte v[2:3], v13 offset:37
-; CHECK-NEXT:    flat_store_byte v[2:3], v14 offset:36
-; CHECK-NEXT:    flat_store_byte v[2:3], v15 offset:35
-; CHECK-NEXT:    flat_store_byte v[2:3], v16 offset:34
-; CHECK-NEXT:    flat_store_byte v[2:3], v17 offset:33
-; CHECK-NEXT:    flat_store_byte v[2:3], v18 offset:32
-; CHECK-NEXT:    flat_store_byte v[2:3], v0 offset:31
+; CHECK-NEXT:    flat_store_byte v[11:12], v13 offset:46
+; CHECK-NEXT:    flat_store_short v[11:12], v14 offset:44
+; CHECK-NEXT:    flat_store_dwordx3 v[11:12], v[8:10] offset:32
+; CHECK-NEXT:    flat_store_dwordx4 v[11:12], v[0:3] offset:16
+; CHECK-NEXT:    flat_store_dwordx4 v[11:12], v[4:7]
 ; CHECK-NEXT:    s_endpgm
 entry:
   tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false)
@@ -185,375 +98,59 @@ define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr add
 ; CHECK-NEXT:    s_mov_b64 s[16:17], s[0:1]
 ; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x8
 ; CHECK-NEXT:    s_load_dword s2, s[6:7], 0x0
-; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    v_mov_b32_e32 v24, 0
 ; CHECK-NEXT:    s_add_u32 s16, s16, s13
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:15
-; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:14
-; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:13
-; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:12
-; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:11
-; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:10
-; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:9
-; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:8
-; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:7
-; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:6
-; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:5
-; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:4
-; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:3
-; CHECK-NEXT:    global_load_ubyte v15, v0, s[0:1] offset:2
-; CHECK-NEXT:    global_load_ubyte v16, v0, s[0:1] offset:1
-; CHECK-NEXT:    global_load_ubyte v17, v0, s[0:1]
-; CHECK-NEXT:    global_load_ubyte v18, v0, s[0:1] offset:31
-; CHECK-NEXT:    global_load_ubyte v19, v0, s[0:1] offset:30
+; CHECK-NEXT:    global_load_dwordx4 v[0:3], v24, s[0:1] offset:112
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v24, s[0:1] offset:96
+; CHECK-NEXT:    global_load_dwordx4 v[8:11], v24, s[0:1] offset:80
+; CHECK-NEXT:    global_load_dwordx4 v[12:15], v24, s[0:1] offset:64
+; CHECK-NEXT:    global_load_dwordx4 v[16:19], v24, s[0:1] offset:48
+; CHECK-NEXT:    global_load_dwordx4 v[20:23], v24, s[0:1] offset:32
 ; CHECK-NEXT:    s_addc_u32 s17, s17, 0
-; CHECK-NEXT:    v_mov_b32_e32 v1, s2
-; CHECK-NEXT:    global_load_ubyte v20, v0, s[0:1] offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v2, v1, s[16:19], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v3, v1, s[16:19], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v4, v1, s[16:19], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v5, v1, s[16:19], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v6, v1, s[16:19], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v7, v1, s[16:19], 0 offen offset:10
-; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:23
+; CHECK-NEXT:    v_mov_b32_e32 v25, s2
+; CHECK-NEXT:    s_waitcnt vmcnt(5)
+; CHECK-NEXT:    buffer_store_dword v3, v25, s[16:19], 0 offen offset:124
+; CHECK-NEXT:    buffer_store_dword v2, v25, s[16:19], 0 offen offset:120
+; CHECK-NEXT:    buffer_store_dword v1, v25, s[16:19], 0 offen offset:116
+; CHECK-NEXT:    buffer_store_dword v0, v25, s[16:19], 0 offen offset:112
+; CHECK-NEXT:    global_load_dwordx4 v[0:3], v24, s[0:1] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(9)
+; CHECK-NEXT:    buffer_store_dword v7, v25, s[16:19], 0 offen offset:108
+; CHECK-NEXT:    buffer_store_dword v6, v25, s[16:19], 0 offen offset:104
+; CHECK-NEXT:    buffer_store_dword v5, v25, s[16:19], 0 offen offset:100
+; CHECK-NEXT:    buffer_store_dword v4, v25, s[16:19], 0 offen offset:96
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v24, s[0:1]
+; CHECK-NEXT:    s_waitcnt vmcnt(13)
+; CHECK-NEXT:    buffer_store_dword v11, v25, s[16:19], 0 offen offset:92
+; CHECK-NEXT:    buffer_store_dword v10, v25, s[16:19], 0 offen offset:88
+; CHECK-NEXT:    buffer_store_dword v9, v25, s[16:19], 0 offen offset:84
+; CHECK-NEXT:    buffer_store_dword v8, v25, s[16:19], 0 offen offset:80
+; CHECK-NEXT:    s_waitcnt vmcnt(16)
+; CHECK-NEXT:    buffer_store_dword v15, v25, s[16:19], 0 offen offset:76
+; CHECK-NEXT:    buffer_store_dword v14, v25, s[16:19], 0 offen offset:72
+; CHECK-NEXT:    buffer_store_dword v13, v25, s[16:19], 0 offen offset:68
+; CHECK-NEXT:    buffer_store_dword v12, v25, s[16:19], 0 offen offset:64
 ; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    buffer_store_byte v8, v1, s[16:19], 0 offen offset:9
-; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    buffer_store_byte v9, v1, s[16:19], 0 offen offset:8
-; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(21)
-; CHECK-NEXT:    buffer_store_byte v10, v1, s[16:19], 0 offen offset:7
-; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:20
+; CHECK-NEXT:    buffer_store_dword v19, v25, s[16:19], 0 offen offset:60
+; CHECK-NEXT:    buffer_store_dword v18, v25, s[16:19], 0 offen offset:56
+; CHECK-NEXT:    buffer_store_dword v17, v25, s[16:19], 0 offen offset:52
+; CHECK-NEXT:    buffer_store_dword v16, v25, s[16:19], 0 offen offset:48
 ; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    buffer_store_byte v11, v1, s[16:19], 0 offen offset:6
-; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v12, v1, s[16:19], 0 offen offset:5
-; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    buffer_store_byte v15, v1, s[16:19], 0 offen offset:2
-; CHECK-NEXT:    global_load_ubyte v15, v0, s[0:1] offset:47
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v16, v1, s[16:19], 0 offen offset:1
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v17, v1, s[16:19], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v18, v1, s[16:19], 0 offen offset:31
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v19, v1, s[16:19], 0 offen offset:30
-; CHECK-NEXT:    buffer_store_byte v13, v1, s[16:19], 0 offen offset:4
-; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:17
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v14, v1, s[16:19], 0 offen offset:3
-; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:16
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:27
-; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:26
-; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:25
-; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:24
-; CHECK-NEXT:    global_load_ubyte v17, v0, s[0:1] offset:45
-; CHECK-NEXT:    global_load_ubyte v18, v0, s[0:1] offset:44
-; CHECK-NEXT:    global_load_ubyte v19, v0, s[0:1] offset:43
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v7, v1, s[16:19], 0 offen offset:23
-; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:36
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v8, v1, s[16:19], 0 offen offset:22
-; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:35
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v9, v1, s[16:19], 0 offen offset:21
-; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:34
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v10, v1, s[16:19], 0 offen offset:20
-; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:33
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v11, v1, s[16:19], 0 offen offset:19
-; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:32
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:28
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v20, v1, s[16:19], 0 offen offset:29
-; CHECK-NEXT:    global_load_ubyte v20, v0, s[0:1] offset:42
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v12, v1, s[16:19], 0 offen offset:18
-; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:63
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    buffer_store_byte v13, v1, s[16:19], 0 offen offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v14, v1, s[16:19], 0 offen offset:16
-; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:61
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    buffer_store_byte v3, v1, s[16:19], 0 offen offset:27
-; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:40
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v4, v1, s[16:19], 0 offen offset:26
-; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:39
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    buffer_store_byte v5, v1, s[16:19], 0 offen offset:25
-; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:38
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v6, v1, s[16:19], 0 offen offset:24
-; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:37
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v18, v1, s[16:19], 0 offen offset:44
-; CHECK-NEXT:    global_load_ubyte v18, v0, s[0:1] offset:57
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    buffer_store_byte v19, v1, s[16:19], 0 offen offset:43
-; CHECK-NEXT:    global_load_ubyte v19, v0, s[0:1] offset:56
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v17, v1, s[16:19], 0 offen offset:45
-; CHECK-NEXT:    global_load_ubyte v17, v0, s[0:1] offset:58
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v7, v1, s[16:19], 0 offen offset:36
-; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:49
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v8, v1, s[16:19], 0 offen offset:35
-; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:48
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    global_load_ubyte v16, v0, s[0:1] offset:46
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v15, v1, s[16:19], 0 offen offset:47
-; CHECK-NEXT:    global_load_ubyte v15, v0, s[0:1] offset:60
-; CHECK-NEXT:    s_waitcnt vmcnt(33)
-; CHECK-NEXT:    buffer_store_byte v9, v1, s[16:19], 0 offen offset:34
-; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:79
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v2, v1, s[16:19], 0 offen offset:28
-; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:41
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v20, v1, s[16:19], 0 offen offset:42
-; CHECK-NEXT:    global_load_ubyte v20, v0, s[0:1] offset:55
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v10, v1, s[16:19], 0 offen offset:33
-; CHECK-NEXT:    buffer_store_byte v11, v1, s[16:19], 0 offen offset:32
-; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:77
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v14, v1, s[16:19], 0 offen offset:61
-; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:74
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v3, v1, s[16:19], 0 offen offset:40
-; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:53
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v4, v1, s[16:19], 0 offen offset:39
-; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:52
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v5, v1, s[16:19], 0 offen offset:38
-; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:51
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v6, v1, s[16:19], 0 offen offset:37
-; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:50
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v18, v1, s[16:19], 0 offen offset:57
-; CHECK-NEXT:    global_load_ubyte v18, v0, s[0:1] offset:70
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v19, v1, s[16:19], 0 offen offset:56
-; CHECK-NEXT:    global_load_ubyte v19, v0, s[0:1] offset:69
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v17, v1, s[16:19], 0 offen offset:58
-; CHECK-NEXT:    global_load_ubyte v17, v0, s[0:1] offset:71
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v7, v1, s[16:19], 0 offen offset:49
-; CHECK-NEXT:    s_waitcnt vmcnt(29)
-; CHECK-NEXT:    buffer_store_byte v8, v1, s[16:19], 0 offen offset:48
-; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:93
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v16, v1, s[16:19], 0 offen offset:46
-; CHECK-NEXT:    global_load_ubyte v16, v0, s[0:1] offset:59
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v15, v1, s[16:19], 0 offen offset:60
-; CHECK-NEXT:    global_load_ubyte v15, v0, s[0:1] offset:73
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    buffer_store_byte v2, v1, s[16:19], 0 offen offset:41
-; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:54
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    buffer_store_byte v20, v1, s[16:19], 0 offen offset:55
-; CHECK-NEXT:    global_load_ubyte v20, v0, s[0:1] offset:68
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v14, v1, s[16:19], 0 offen offset:74
-; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:87
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v3, v1, s[16:19], 0 offen offset:53
-; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:66
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v4, v1, s[16:19], 0 offen offset:52
-; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:65
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v5, v1, s[16:19], 0 offen offset:51
-; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:64
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:62
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v12, v1, s[16:19], 0 offen offset:63
-; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:76
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    buffer_store_byte v6, v1, s[16:19], 0 offen offset:50
-; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:95
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v11, v1, s[16:19], 0 offen offset:77
-; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:90
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    buffer_store_byte v17, v1, s[16:19], 0 offen offset:71
-; CHECK-NEXT:    global_load_ubyte v17, v0, s[0:1] offset:83
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v18, v1, s[16:19], 0 offen offset:70
-; CHECK-NEXT:    buffer_store_byte v19, v1, s[16:19], 0 offen offset:69
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v16, v1, s[16:19], 0 offen offset:59
-; CHECK-NEXT:    global_load_ubyte v16, v0, s[0:1] offset:72
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v15, v1, s[16:19], 0 offen offset:73
-; CHECK-NEXT:    global_load_ubyte v15, v0, s[0:1] offset:85
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v2, v1, s[16:19], 0 offen offset:54
-; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:67
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v20, v1, s[16:19], 0 offen offset:68
-; CHECK-NEXT:    global_load_ubyte v20, v0, s[0:1] offset:81
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v3, v1, s[16:19], 0 offen offset:66
-; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:111
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v4, v1, s[16:19], 0 offen offset:65
-; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:110
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v5, v1, s[16:19], 0 offen offset:64
-; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:109
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    buffer_store_byte v13, v1, s[16:19], 0 offen offset:62
-; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:75
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    buffer_store_byte v12, v1, s[16:19], 0 offen offset:76
-; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:89
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    buffer_store_byte v11, v1, s[16:19], 0 offen offset:90
-; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:103
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v16, v1, s[16:19], 0 offen offset:72
-; CHECK-NEXT:    global_load_ubyte v16, v0, s[0:1] offset:86
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    global_load_ubyte v18, v0, s[0:1] offset:84
-; CHECK-NEXT:    global_load_ubyte v19, v0, s[0:1] offset:82
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v14, v1, s[16:19], 0 offen offset:87
-; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:100
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    buffer_store_byte v2, v1, s[16:19], 0 offen offset:67
-; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:80
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:78
-; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:94
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v9, v1, s[16:19], 0 offen offset:79
-; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:92
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v6, v1, s[16:19], 0 offen offset:95
-; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:108
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v8, v1, s[16:19], 0 offen offset:93
-; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:106
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    buffer_store_byte v13, v1, s[16:19], 0 offen offset:75
-; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:88
+; CHECK-NEXT:    buffer_store_dword v23, v25, s[16:19], 0 offen offset:44
+; CHECK-NEXT:    buffer_store_dword v22, v25, s[16:19], 0 offen offset:40
+; CHECK-NEXT:    buffer_store_dword v21, v25, s[16:19], 0 offen offset:36
+; CHECK-NEXT:    buffer_store_dword v20, v25, s[16:19], 0 offen offset:32
+; CHECK-NEXT:    s_waitcnt vmcnt(21)
+; CHECK-NEXT:    buffer_store_dword v3, v25, s[16:19], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v2, v25, s[16:19], 0 offen offset:24
+; CHECK-NEXT:    buffer_store_dword v1, v25, s[16:19], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v0, v25, s[16:19], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    buffer_store_byte v12, v1, s[16:19], 0 offen offset:89
-; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:102
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v10, v1, s[16:19], 0 offen offset:78
-; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:91
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v7, v1, s[16:19], 0 offen offset:94
-; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:107
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v9, v1, s[16:19], 0 offen offset:92
-; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:105
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v13, v1, s[16:19], 0 offen offset:88
-; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:101
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v10, v1, s[16:19], 0 offen offset:91
-; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:104
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v16, v1, s[16:19], 0 offen offset:86
-; CHECK-NEXT:    buffer_store_byte v15, v1, s[16:19], 0 offen offset:85
-; CHECK-NEXT:    buffer_store_byte v18, v1, s[16:19], 0 offen offset:84
-; CHECK-NEXT:    buffer_store_byte v17, v1, s[16:19], 0 offen offset:83
-; CHECK-NEXT:    buffer_store_byte v19, v1, s[16:19], 0 offen offset:82
-; CHECK-NEXT:    global_load_ubyte v15, v0, s[0:1] offset:96
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    global_load_ubyte v16, v0, s[0:1] offset:97
-; CHECK-NEXT:    global_load_ubyte v17, v0, s[0:1] offset:98
-; CHECK-NEXT:    global_load_ubyte v18, v0, s[0:1] offset:99
-; CHECK-NEXT:    global_load_ubyte v19, v0, s[0:1] offset:120
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v20, v1, s[16:19], 0 offen offset:81
-; CHECK-NEXT:    buffer_store_byte v2, v1, s[16:19], 0 offen offset:80
-; CHECK-NEXT:    buffer_store_byte v3, v1, s[16:19], 0 offen offset:111
-; CHECK-NEXT:    buffer_store_byte v4, v1, s[16:19], 0 offen offset:110
-; CHECK-NEXT:    buffer_store_byte v5, v1, s[16:19], 0 offen offset:109
-; CHECK-NEXT:    buffer_store_byte v6, v1, s[16:19], 0 offen offset:108
-; CHECK-NEXT:    buffer_store_byte v14, v1, s[16:19], 0 offen offset:100
-; CHECK-NEXT:    global_load_ubyte v20, v0, s[0:1] offset:121
-; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:122
-; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:123
-; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:124
-; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:125
-; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:126
-; CHECK-NEXT:    s_waitcnt vmcnt(29)
-; CHECK-NEXT:    buffer_store_byte v7, v1, s[16:19], 0 offen offset:107
-; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:127
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v8, v1, s[16:19], 0 offen offset:106
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v9, v1, s[16:19], 0 offen offset:105
-; CHECK-NEXT:    buffer_store_byte v11, v1, s[16:19], 0 offen offset:103
-; CHECK-NEXT:    buffer_store_byte v12, v1, s[16:19], 0 offen offset:102
-; CHECK-NEXT:    s_waitcnt vmcnt(31)
-; CHECK-NEXT:    buffer_store_byte v13, v1, s[16:19], 0 offen offset:101
-; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:116
-; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:117
-; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:119
-; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:114
-; CHECK-NEXT:    s_waitcnt vmcnt(34)
-; CHECK-NEXT:    buffer_store_byte v10, v1, s[16:19], 0 offen offset:104
-; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:118
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:115
-; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:113
-; CHECK-NEXT:    global_load_ubyte v21, v0, s[0:1] offset:112
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v18, v1, s[16:19], 0 offen offset:99
-; CHECK-NEXT:    buffer_store_byte v17, v1, s[16:19], 0 offen offset:98
-; CHECK-NEXT:    buffer_store_byte v16, v1, s[16:19], 0 offen offset:97
-; CHECK-NEXT:    buffer_store_byte v15, v1, s[16:19], 0 offen offset:96
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v7, v1, s[16:19], 0 offen offset:127
-; CHECK-NEXT:    buffer_store_byte v6, v1, s[16:19], 0 offen offset:126
-; CHECK-NEXT:    buffer_store_byte v5, v1, s[16:19], 0 offen offset:125
-; CHECK-NEXT:    buffer_store_byte v4, v1, s[16:19], 0 offen offset:124
-; CHECK-NEXT:    buffer_store_byte v3, v1, s[16:19], 0 offen offset:123
-; CHECK-NEXT:    buffer_store_byte v2, v1, s[16:19], 0 offen offset:122
-; CHECK-NEXT:    buffer_store_byte v20, v1, s[16:19], 0 offen offset:121
-; CHECK-NEXT:    buffer_store_byte v19, v1, s[16:19], 0 offen offset:120
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v11, v1, s[16:19], 0 offen offset:119
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    buffer_store_byte v10, v1, s[16:19], 0 offen offset:118
-; CHECK-NEXT:    buffer_store_byte v9, v1, s[16:19], 0 offen offset:117
-; CHECK-NEXT:    buffer_store_byte v8, v1, s[16:19], 0 offen offset:116
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v13, v1, s[16:19], 0 offen offset:115
-; CHECK-NEXT:    buffer_store_byte v12, v1, s[16:19], 0 offen offset:114
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    buffer_store_byte v14, v1, s[16:19], 0 offen offset:113
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    buffer_store_byte v21, v1, s[16:19], 0 offen offset:112
+; CHECK-NEXT:    buffer_store_dword v7, v25, s[16:19], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v6, v25, s[16:19], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v5, v25, s[16:19], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v4, v25, s[16:19], 0 offen
 ; CHECK-NEXT:    s_endpgm
 entry:
   tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false)
@@ -569,363 +166,57 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %
 ; CHECK-NEXT:    s_add_u32 s16, s16, s13
 ; CHECK-NEXT:    s_addc_u32 s17, s17, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v2, s0
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:11
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:2
+; CHECK-NEXT:    v_mov_b32_e32 v26, s0
+; CHECK-NEXT:    buffer_load_dword v3, v26, s[16:19], 0 offen offset:124
+; CHECK-NEXT:    buffer_load_dword v2, v26, s[16:19], 0 offen offset:120
+; CHECK-NEXT:    buffer_load_dword v1, v26, s[16:19], 0 offen offset:116
+; CHECK-NEXT:    buffer_load_dword v0, v26, s[16:19], 0 offen offset:112
+; CHECK-NEXT:    buffer_load_dword v7, v26, s[16:19], 0 offen offset:108
+; CHECK-NEXT:    buffer_load_dword v6, v26, s[16:19], 0 offen offset:104
+; CHECK-NEXT:    buffer_load_dword v5, v26, s[16:19], 0 offen offset:100
+; CHECK-NEXT:    buffer_load_dword v4, v26, s[16:19], 0 offen offset:96
 ; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:31
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v0, s0
-; CHECK-NEXT:    v_mov_b32_e32 v1, s1
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:14
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:13
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:12
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:11
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:10
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:9
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:23
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:8
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:22
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:7
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:21
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:6
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:20
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:5
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:19
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:4
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:18
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:3
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:17
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:1
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:47
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v18
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:31
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:2
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:16
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:45
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:23
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:37
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:22
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:36
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:21
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:35
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:20
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:34
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:19
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:33
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:18
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:32
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:29
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:30
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:44
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:17
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:63
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:16
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:28
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:42
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:26
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:40
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:25
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:39
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:24
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:38
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:27
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:41
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:45
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:59
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:37
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:51
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:36
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:50
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:35
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:49
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:34
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:48
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:46
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:47
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:61
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:29
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:43
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:44
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:58
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:33
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:79
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:32
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:42
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:56
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:40
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:54
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:39
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:53
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:38
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:52
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:41
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:55
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:59
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:73
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:51
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:65
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:50
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:64
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:62
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:63
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:77
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:46
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:60
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:61
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:75
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:43
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:57
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:58
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:72
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:49
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:95
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:48
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:56
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:70
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:54
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:68
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:53
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:67
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:52
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:66
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:55
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:69
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:73
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:87
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:65
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:111
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:64
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:110
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:62
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:76
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:77
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:91
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:60
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:74
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:75
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:89
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:57
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:71
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:72
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:86
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:70
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:84
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:68
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:83
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:67
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:81
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:66
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:80
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:78
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:79
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:93
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:69
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:82
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:87
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:101
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:76
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:90
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:91
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:105
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:74
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:88
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:89
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:103
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:71
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:85
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:86
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:100
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:78
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:92
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:93
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:107
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:90
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:104
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:88
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:102
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:85
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:99
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:94
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:95
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:109
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:92
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:106
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:94
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:108
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:84
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:83
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:82
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:81
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:96
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:97
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:98
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:120
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:80
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:111
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:110
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:109
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:99
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:121
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:122
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:123
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:124
+; CHECK-NEXT:    buffer_load_dword v8, v26, s[16:19], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v9, v26, s[16:19], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v10, v26, s[16:19], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v11, v26, s[16:19], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v12, v26, s[16:19], 0 offen offset:32
+; CHECK-NEXT:    buffer_load_dword v13, v26, s[16:19], 0 offen offset:36
+; CHECK-NEXT:    buffer_load_dword v14, v26, s[16:19], 0 offen offset:40
+; CHECK-NEXT:    buffer_load_dword v15, v26, s[16:19], 0 offen offset:44
+; CHECK-NEXT:    buffer_load_dword v16, v26, s[16:19], 0 offen offset:48
+; CHECK-NEXT:    buffer_load_dword v17, v26, s[16:19], 0 offen offset:52
+; CHECK-NEXT:    buffer_load_dword v18, v26, s[16:19], 0 offen offset:56
+; CHECK-NEXT:    buffer_load_dword v19, v26, s[16:19], 0 offen offset:60
+; CHECK-NEXT:    buffer_load_dword v23, v26, s[16:19], 0 offen offset:92
+; CHECK-NEXT:    buffer_load_dword v22, v26, s[16:19], 0 offen offset:88
+; CHECK-NEXT:    buffer_load_dword v21, v26, s[16:19], 0 offen offset:84
+; CHECK-NEXT:    buffer_load_dword v20, v26, s[16:19], 0 offen offset:80
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v25, s1
+; CHECK-NEXT:    v_mov_b32_e32 v24, s0
+; CHECK-NEXT:    s_waitcnt vmcnt(20)
+; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[0:3] offset:112
+; CHECK-NEXT:    buffer_load_dword v3, v26, s[16:19], 0 offen offset:76
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:107
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:105
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:104
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:103
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:106
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:102
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:101
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:100
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:126
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:116
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:117
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:118
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:119
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:127
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:114
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:115
+; CHECK-NEXT:    buffer_load_dword v2, v26, s[16:19], 0 offen offset:72
+; CHECK-NEXT:    buffer_load_dword v1, v26, s[16:19], 0 offen offset:68
+; CHECK-NEXT:    buffer_load_dword v0, v26, s[16:19], 0 offen offset:64
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:108
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:125
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:113
-; CHECK-NEXT:    buffer_load_ubyte v21, v2, s[16:19], 0 offen offset:112
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:98
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:97
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:96
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:127
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:126
+; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[4:7] offset:96
+; CHECK-NEXT:    buffer_load_dword v4, v26, s[16:19], 0 offen
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    buffer_load_dword v5, v26, s[16:19], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v6, v26, s[16:19], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v7, v26, s[16:19], 0 offen offset:12
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[20:23] offset:80
+; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[0:3] offset:64
+; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[16:19] offset:48
+; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[12:15] offset:32
+; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[8:11] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:125
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:124
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:123
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:122
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:121
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:120
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:119
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:118
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:117
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:116
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:115
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:114
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:113
-; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:112
+; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[4:7]
 ; CHECK-NEXT:    s_endpgm
 entry:
   tail call void @llvm.memcpy.p0.p5.i64(ptr %generic, ptr addrspace(5) %src, i64 128, i1 false)
@@ -972,279 +263,27 @@ define amdgpu_kernel void @memcpy_p0_p3_minsize(ptr %generic) #0 {
 ; CHECK-LABEL: memcpy_p0_p3_minsize:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-NEXT:    v_mov_b32_e32 v2, 0
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:112
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:113
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:114
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:115
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:116
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:117
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:118
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:119
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v0, s0
-; CHECK-NEXT:    v_mov_b32_e32 v1, s1
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:112
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:113
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:114
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:115
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:116
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:117
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:118
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:119
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:120
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:121
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:122
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:123
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:124
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:125
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:126
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:127
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:120
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:121
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:122
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:123
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:124
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:125
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:126
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:127
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:96
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:97
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:98
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:99
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:100
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:101
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:102
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:103
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:96
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:97
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:98
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:99
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:100
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:101
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:102
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:103
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:104
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:105
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:106
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:107
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:108
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:109
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:110
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:111
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:104
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:105
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:106
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:107
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:108
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:109
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:110
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:111
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:80
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:81
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:82
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:83
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:84
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:85
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:86
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:87
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:80
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:81
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:82
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:83
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:84
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:85
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:86
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:87
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:88
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:89
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:90
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:91
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:92
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:93
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:94
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:95
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:88
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:89
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:90
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:91
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:92
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:93
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:94
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:95
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:64
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:65
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:66
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:67
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:68
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:69
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:70
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:71
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:64
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:65
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:66
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:67
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:68
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:69
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:70
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:71
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:72
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:73
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:74
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:75
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:76
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:77
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:78
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:79
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:72
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:73
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:74
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:75
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:76
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:77
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:78
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:79
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:48
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:49
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:50
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:51
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:52
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:53
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:54
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:55
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:48
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:49
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:50
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:51
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:52
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:53
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:54
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:55
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:56
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:57
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:58
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:59
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:60
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:61
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:62
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:63
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:56
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:57
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:58
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:59
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:60
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:61
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:62
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:63
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:32
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:33
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:34
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:35
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:36
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:37
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:38
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:39
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:32
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:33
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:34
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:35
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:36
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:37
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:38
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:39
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:40
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:41
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:42
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:43
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:44
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:45
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:46
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:47
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:40
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:41
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:42
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:43
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:44
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:45
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:46
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:47
-; CHECK-NEXT:    ds_read_u8 v3, v2
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:1
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:2
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:3
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:4
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:5
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:6
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:7
-; CHECK-NEXT:    ds_read_u8 v11, v2 offset:8
-; CHECK-NEXT:    ds_read_u8 v12, v2 offset:9
-; CHECK-NEXT:    ds_read_u8 v13, v2 offset:10
-; CHECK-NEXT:    ds_read_u8 v14, v2 offset:11
-; CHECK-NEXT:    ds_read_u8 v15, v2 offset:12
-; CHECK-NEXT:    ds_read_u8 v16, v2 offset:13
-; CHECK-NEXT:    ds_read_u8 v17, v2 offset:14
-; CHECK-NEXT:    ds_read_u8 v18, v2 offset:15
-; CHECK-NEXT:    ds_read_u8 v19, v2 offset:16
-; CHECK-NEXT:    ds_read_u8 v20, v2 offset:17
-; CHECK-NEXT:    ds_read_u8 v21, v2 offset:18
-; CHECK-NEXT:    ds_read_u8 v22, v2 offset:19
-; CHECK-NEXT:    ds_read_u8 v23, v2 offset:20
-; CHECK-NEXT:    ds_read_u8 v24, v2 offset:21
-; CHECK-NEXT:    ds_read_u8 v25, v2 offset:22
-; CHECK-NEXT:    ds_read_u8 v26, v2 offset:23
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:16
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:17
-; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:18
-; CHECK-NEXT:    flat_store_byte v[0:1], v22 offset:19
-; CHECK-NEXT:    flat_store_byte v[0:1], v23 offset:20
-; CHECK-NEXT:    flat_store_byte v[0:1], v24 offset:21
-; CHECK-NEXT:    flat_store_byte v[0:1], v25 offset:22
-; CHECK-NEXT:    flat_store_byte v[0:1], v26 offset:23
-; CHECK-NEXT:    ds_read_u8 v19, v2 offset:24
-; CHECK-NEXT:    ds_read_u8 v20, v2 offset:25
-; CHECK-NEXT:    ds_read_u8 v21, v2 offset:26
-; CHECK-NEXT:    ds_read_u8 v22, v2 offset:27
-; CHECK-NEXT:    ds_read_u8 v23, v2 offset:28
-; CHECK-NEXT:    ds_read_u8 v24, v2 offset:29
-; CHECK-NEXT:    ds_read_u8 v25, v2 offset:30
-; CHECK-NEXT:    ds_read_u8 v2, v2 offset:31
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:24
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:25
-; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:26
-; CHECK-NEXT:    flat_store_byte v[0:1], v22 offset:27
-; CHECK-NEXT:    flat_store_byte v[0:1], v23 offset:28
-; CHECK-NEXT:    flat_store_byte v[0:1], v24 offset:29
-; CHECK-NEXT:    flat_store_byte v[0:1], v25 offset:30
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:31
-; CHECK-NEXT:    flat_store_byte v[0:1], v3
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:1
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:2
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:3
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:4
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:5
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:6
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:7
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:8
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:9
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:10
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:11
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:12
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:13
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:14
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:15
+; CHECK-NEXT:    v_mov_b32_e32 v16, 0
+; CHECK-NEXT:    ds_read2_b64 v[0:3], v16 offset1:1
+; CHECK-NEXT:    ds_read2_b64 v[4:7], v16 offset0:2 offset1:3
+; CHECK-NEXT:    ds_read2_b64 v[8:11], v16 offset0:4 offset1:5
+; CHECK-NEXT:    ds_read2_b64 v[12:15], v16 offset0:6 offset1:7
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v21, s1
+; CHECK-NEXT:    v_mov_b32_e32 v20, s0
+; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[0:3]
+; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[4:7] offset:16
+; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[8:11] offset:32
+; CHECK-NEXT:    ds_read2_b64 v[0:3], v16 offset0:8 offset1:9
+; CHECK-NEXT:    ds_read2_b64 v[4:7], v16 offset0:10 offset1:11
+; CHECK-NEXT:    ds_read2_b64 v[8:11], v16 offset0:12 offset1:13
+; CHECK-NEXT:    ds_read2_b64 v[16:19], v16 offset0:14 offset1:15
+; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[12:15] offset:48
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[0:3] offset:64
+; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[4:7] offset:80
+; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[8:11] offset:96
+; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[16:19] offset:112
 ; CHECK-NEXT:    s_endpgm
 entry:
   tail call void @llvm.memcpy.p0.p3.i64(ptr %generic, ptr addrspace(3) @shared, i64 128, i1 false)
@@ -1256,108 +295,21 @@ define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 {
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-NEXT:    v_mov_b32_e32 v1, s3
-; CHECK-NEXT:    flat_load_ubyte v4, v[0:1]
-; CHECK-NEXT:    flat_load_ubyte v5, v[0:1] offset:1
-; CHECK-NEXT:    flat_load_ubyte v6, v[0:1] offset:2
-; CHECK-NEXT:    flat_load_ubyte v7, v[0:1] offset:3
-; CHECK-NEXT:    flat_load_ubyte v8, v[0:1] offset:4
-; CHECK-NEXT:    flat_load_ubyte v9, v[0:1] offset:5
-; CHECK-NEXT:    flat_load_ubyte v10, v[0:1] offset:6
-; CHECK-NEXT:    flat_load_ubyte v11, v[0:1] offset:7
-; CHECK-NEXT:    flat_load_ubyte v12, v[0:1] offset:8
-; CHECK-NEXT:    flat_load_ubyte v13, v[0:1] offset:9
-; CHECK-NEXT:    flat_load_ubyte v14, v[0:1] offset:10
-; CHECK-NEXT:    flat_load_ubyte v15, v[0:1] offset:11
-; CHECK-NEXT:    flat_load_ubyte v16, v[0:1] offset:12
-; CHECK-NEXT:    flat_load_ubyte v17, v[0:1] offset:13
-; CHECK-NEXT:    flat_load_ubyte v18, v[0:1] offset:14
-; CHECK-NEXT:    v_mov_b32_e32 v3, s1
-; CHECK-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-NEXT:    v_mov_b32_e32 v12, s3
+; CHECK-NEXT:    v_mov_b32_e32 v11, s2
+; CHECK-NEXT:    flat_load_ubyte v13, v[11:12] offset:46
+; CHECK-NEXT:    flat_load_ushort v14, v[11:12] offset:44
+; CHECK-NEXT:    flat_load_dwordx3 v[8:10], v[11:12] offset:32
+; CHECK-NEXT:    flat_load_dwordx4 v[0:3], v[11:12] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[11:12]
+; CHECK-NEXT:    v_mov_b32_e32 v12, s1
+; CHECK-NEXT:    v_mov_b32_e32 v11, s0
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[2:3], v4
-; CHECK-NEXT:    flat_store_byte v[2:3], v5 offset:1
-; CHECK-NEXT:    flat_store_byte v[2:3], v6 offset:2
-; CHECK-NEXT:    flat_store_byte v[2:3], v7 offset:3
-; CHECK-NEXT:    flat_store_byte v[2:3], v8 offset:4
-; CHECK-NEXT:    flat_store_byte v[2:3], v9 offset:5
-; CHECK-NEXT:    flat_store_byte v[2:3], v10 offset:6
-; CHECK-NEXT:    flat_store_byte v[2:3], v11 offset:7
-; CHECK-NEXT:    flat_store_byte v[2:3], v12 offset:8
-; CHECK-NEXT:    flat_store_byte v[2:3], v13 offset:9
-; CHECK-NEXT:    flat_store_byte v[2:3], v14 offset:10
-; CHECK-NEXT:    flat_store_byte v[2:3], v15 offset:11
-; CHECK-NEXT:    flat_store_byte v[2:3], v16 offset:12
-; CHECK-NEXT:    flat_store_byte v[2:3], v17 offset:13
-; CHECK-NEXT:    flat_store_byte v[2:3], v18 offset:14
-; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:30
-; CHECK-NEXT:    flat_load_ubyte v5, v[0:1] offset:29
-; CHECK-NEXT:    flat_load_ubyte v6, v[0:1] offset:28
-; CHECK-NEXT:    flat_load_ubyte v7, v[0:1] offset:27
-; CHECK-NEXT:    flat_load_ubyte v8, v[0:1] offset:26
-; CHECK-NEXT:    flat_load_ubyte v9, v[0:1] offset:25
-; CHECK-NEXT:    flat_load_ubyte v10, v[0:1] offset:24
-; CHECK-NEXT:    flat_load_ubyte v11, v[0:1] offset:23
-; CHECK-NEXT:    flat_load_ubyte v12, v[0:1] offset:22
-; CHECK-NEXT:    flat_load_ubyte v13, v[0:1] offset:21
-; CHECK-NEXT:    flat_load_ubyte v14, v[0:1] offset:20
-; CHECK-NEXT:    flat_load_ubyte v15, v[0:1] offset:19
-; CHECK-NEXT:    flat_load_ubyte v16, v[0:1] offset:18
-; CHECK-NEXT:    flat_load_ubyte v17, v[0:1] offset:17
-; CHECK-NEXT:    flat_load_ubyte v18, v[0:1] offset:16
-; CHECK-NEXT:    flat_load_ubyte v19, v[0:1] offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:30
-; CHECK-NEXT:    flat_store_byte v[2:3], v5 offset:29
-; CHECK-NEXT:    flat_store_byte v[2:3], v6 offset:28
-; CHECK-NEXT:    flat_store_byte v[2:3], v7 offset:27
-; CHECK-NEXT:    flat_store_byte v[2:3], v8 offset:26
-; CHECK-NEXT:    flat_store_byte v[2:3], v9 offset:25
-; CHECK-NEXT:    flat_store_byte v[2:3], v10 offset:24
-; CHECK-NEXT:    flat_store_byte v[2:3], v11 offset:23
-; CHECK-NEXT:    flat_store_byte v[2:3], v12 offset:22
-; CHECK-NEXT:    flat_store_byte v[2:3], v13 offset:21
-; CHECK-NEXT:    flat_store_byte v[2:3], v14 offset:20
-; CHECK-NEXT:    flat_store_byte v[2:3], v15 offset:19
-; CHECK-NEXT:    flat_store_byte v[2:3], v16 offset:18
-; CHECK-NEXT:    flat_store_byte v[2:3], v17 offset:17
-; CHECK-NEXT:    flat_store_byte v[2:3], v18 offset:16
-; CHECK-NEXT:    flat_store_byte v[2:3], v19 offset:15
-; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:46
-; CHECK-NEXT:    flat_load_ubyte v5, v[0:1] offset:45
-; CHECK-NEXT:    flat_load_ubyte v6, v[0:1] offset:44
-; CHECK-NEXT:    flat_load_ubyte v7, v[0:1] offset:43
-; CHECK-NEXT:    flat_load_ubyte v8, v[0:1] offset:42
-; CHECK-NEXT:    flat_load_ubyte v9, v[0:1] offset:41
-; CHECK-NEXT:    flat_load_ubyte v10, v[0:1] offset:40
-; CHECK-NEXT:    flat_load_ubyte v11, v[0:1] offset:39
-; CHECK-NEXT:    flat_load_ubyte v12, v[0:1] offset:38
-; CHECK-NEXT:    flat_load_ubyte v13, v[0:1] offset:37
-; CHECK-NEXT:    flat_load_ubyte v14, v[0:1] offset:36
-; CHECK-NEXT:    flat_load_ubyte v15, v[0:1] offset:35
-; CHECK-NEXT:    flat_load_ubyte v16, v[0:1] offset:34
-; CHECK-NEXT:    flat_load_ubyte v17, v[0:1] offset:33
-; CHECK-NEXT:    flat_load_ubyte v18, v[0:1] offset:32
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_load_ubyte v0, v[0:1] offset:31
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:46
-; CHECK-NEXT:    flat_store_byte v[2:3], v5 offset:45
-; CHECK-NEXT:    flat_store_byte v[2:3], v6 offset:44
-; CHECK-NEXT:    flat_store_byte v[2:3], v7 offset:43
-; CHECK-NEXT:    flat_store_byte v[2:3], v8 offset:42
-; CHECK-NEXT:    flat_store_byte v[2:3], v9 offset:41
-; CHECK-NEXT:    flat_store_byte v[2:3], v10 offset:40
-; CHECK-NEXT:    flat_store_byte v[2:3], v11 offset:39
-; CHECK-NEXT:    flat_store_byte v[2:3], v12 offset:38
-; CHECK-NEXT:    flat_store_byte v[2:3], v13 offset:37
-; CHECK-NEXT:    flat_store_byte v[2:3], v14 offset:36
-; CHECK-NEXT:    flat_store_byte v[2:3], v15 offset:35
-; CHECK-NEXT:    flat_store_byte v[2:3], v16 offset:34
-; CHECK-NEXT:    flat_store_byte v[2:3], v17 offset:33
-; CHECK-NEXT:    flat_store_byte v[2:3], v18 offset:32
-; CHECK-NEXT:    flat_store_byte v[2:3], v0 offset:31
+; CHECK-NEXT:    flat_store_byte v[11:12], v13 offset:46
+; CHECK-NEXT:    flat_store_short v[11:12], v14 offset:44
+; CHECK-NEXT:    flat_store_dwordx3 v[11:12], v[8:10] offset:32
+; CHECK-NEXT:    flat_store_dwordx4 v[11:12], v[0:3] offset:16
+; CHECK-NEXT:    flat_store_dwordx4 v[11:12], v[4:7]
 ; CHECK-NEXT:    s_endpgm
 entry:
   tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false)
@@ -1431,375 +383,59 @@ define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr add
 ; CHECK-NEXT:    s_mov_b64 s[16:17], s[0:1]
 ; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x8
 ; CHECK-NEXT:    s_load_dword s2, s[6:7], 0x0
-; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    v_mov_b32_e32 v24, 0
 ; CHECK-NEXT:    s_add_u32 s16, s16, s13
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:15
-; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:14
-; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:13
-; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:12
-; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:11
-; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:10
-; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:9
-; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:8
-; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:7
-; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:6
-; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:5
-; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:4
-; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:3
-; CHECK-NEXT:    global_load_ubyte v15, v0, s[0:1] offset:2
-; CHECK-NEXT:    global_load_ubyte v16, v0, s[0:1] offset:1
-; CHECK-NEXT:    global_load_ubyte v17, v0, s[0:1]
-; CHECK-NEXT:    global_load_ubyte v18, v0, s[0:1] offset:31
-; CHECK-NEXT:    global_load_ubyte v19, v0, s[0:1] offset:30
+; CHECK-NEXT:    global_load_dwordx4 v[0:3], v24, s[0:1] offset:112
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v24, s[0:1] offset:96
+; CHECK-NEXT:    global_load_dwordx4 v[8:11], v24, s[0:1] offset:80
+; CHECK-NEXT:    global_load_dwordx4 v[12:15], v24, s[0:1] offset:64
+; CHECK-NEXT:    global_load_dwordx4 v[16:19], v24, s[0:1] offset:48
+; CHECK-NEXT:    global_load_dwordx4 v[20:23], v24, s[0:1] offset:32
 ; CHECK-NEXT:    s_addc_u32 s17, s17, 0
-; CHECK-NEXT:    v_mov_b32_e32 v1, s2
-; CHECK-NEXT:    global_load_ubyte v20, v0, s[0:1] offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v2, v1, s[16:19], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v3, v1, s[16:19], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v4, v1, s[16:19], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v5, v1, s[16:19], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v6, v1, s[16:19], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v7, v1, s[16:19], 0 offen offset:10
-; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:23
+; CHECK-NEXT:    v_mov_b32_e32 v25, s2
+; CHECK-NEXT:    s_waitcnt vmcnt(5)
+; CHECK-NEXT:    buffer_store_dword v3, v25, s[16:19], 0 offen offset:124
+; CHECK-NEXT:    buffer_store_dword v2, v25, s[16:19], 0 offen offset:120
+; CHECK-NEXT:    buffer_store_dword v1, v25, s[16:19], 0 offen offset:116
+; CHECK-NEXT:    buffer_store_dword v0, v25, s[16:19], 0 offen offset:112
+; CHECK-NEXT:    global_load_dwordx4 v[0:3], v24, s[0:1] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(9)
+; CHECK-NEXT:    buffer_store_dword v7, v25, s[16:19], 0 offen offset:108
+; CHECK-NEXT:    buffer_store_dword v6, v25, s[16:19], 0 offen offset:104
+; CHECK-NEXT:    buffer_store_dword v5, v25, s[16:19], 0 offen offset:100
+; CHECK-NEXT:    buffer_store_dword v4, v25, s[16:19], 0 offen offset:96
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v24, s[0:1]
+; CHECK-NEXT:    s_waitcnt vmcnt(13)
+; CHECK-NEXT:    buffer_store_dword v11, v25, s[16:19], 0 offen offset:92
+; CHECK-NEXT:    buffer_store_dword v10, v25, s[16:19], 0 offen offset:88
+; CHECK-NEXT:    buffer_store_dword v9, v25, s[16:19], 0 offen offset:84
+; CHECK-NEXT:    buffer_store_dword v8, v25, s[16:19], 0 offen offset:80
+; CHECK-NEXT:    s_waitcnt vmcnt(16)
+; CHECK-NEXT:    buffer_store_dword v15, v25, s[16:19], 0 offen offset:76
+; CHECK-NEXT:    buffer_store_dword v14, v25, s[16:19], 0 offen offset:72
+; CHECK-NEXT:    buffer_store_dword v13, v25, s[16:19], 0 offen offset:68
+; CHECK-NEXT:    buffer_store_dword v12, v25, s[16:19], 0 offen offset:64
 ; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    buffer_store_byte v8, v1, s[16:19], 0 offen offset:9
-; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    buffer_store_byte v9, v1, s[16:19], 0 offen offset:8
-; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(21)
-; CHECK-NEXT:    buffer_store_byte v10, v1, s[16:19], 0 offen offset:7
-; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:20
+; CHECK-NEXT:    buffer_store_dword v19, v25, s[16:19], 0 offen offset:60
+; CHECK-NEXT:    buffer_store_dword v18, v25, s[16:19], 0 offen offset:56
+; CHECK-NEXT:    buffer_store_dword v17, v25, s[16:19], 0 offen offset:52
+; CHECK-NEXT:    buffer_store_dword v16, v25, s[16:19], 0 offen offset:48
 ; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    buffer_store_byte v11, v1, s[16:19], 0 offen offset:6
-; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v12, v1, s[16:19], 0 offen offset:5
-; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    buffer_store_byte v15, v1, s[16:19], 0 offen offset:2
-; CHECK-NEXT:    global_load_ubyte v15, v0, s[0:1] offset:47
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v16, v1, s[16:19], 0 offen offset:1
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v17, v1, s[16:19], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v18, v1, s[16:19], 0 offen offset:31
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v19, v1, s[16:19], 0 offen offset:30
-; CHECK-NEXT:    buffer_store_byte v13, v1, s[16:19], 0 offen offset:4
-; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:17
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v14, v1, s[16:19], 0 offen offset:3
-; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:16
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:27
-; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:26
-; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:25
-; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:24
-; CHECK-NEXT:    global_load_ubyte v17, v0, s[0:1] offset:45
-; CHECK-NEXT:    global_load_ubyte v18, v0, s[0:1] offset:44
-; CHECK-NEXT:    global_load_ubyte v19, v0, s[0:1] offset:43
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v7, v1, s[16:19], 0 offen offset:23
-; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:36
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v8, v1, s[16:19], 0 offen offset:22
-; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:35
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v9, v1, s[16:19], 0 offen offset:21
-; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:34
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v10, v1, s[16:19], 0 offen offset:20
-; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:33
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v11, v1, s[16:19], 0 offen offset:19
-; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:32
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:28
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v20, v1, s[16:19], 0 offen offset:29
-; CHECK-NEXT:    global_load_ubyte v20, v0, s[0:1] offset:42
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v12, v1, s[16:19], 0 offen offset:18
-; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:63
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    buffer_store_byte v13, v1, s[16:19], 0 offen offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v14, v1, s[16:19], 0 offen offset:16
-; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:61
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    buffer_store_byte v3, v1, s[16:19], 0 offen offset:27
-; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:40
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v4, v1, s[16:19], 0 offen offset:26
-; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:39
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    buffer_store_byte v5, v1, s[16:19], 0 offen offset:25
-; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:38
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v6, v1, s[16:19], 0 offen offset:24
-; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:37
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v18, v1, s[16:19], 0 offen offset:44
-; CHECK-NEXT:    global_load_ubyte v18, v0, s[0:1] offset:57
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    buffer_store_byte v19, v1, s[16:19], 0 offen offset:43
-; CHECK-NEXT:    global_load_ubyte v19, v0, s[0:1] offset:56
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v17, v1, s[16:19], 0 offen offset:45
-; CHECK-NEXT:    global_load_ubyte v17, v0, s[0:1] offset:58
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v7, v1, s[16:19], 0 offen offset:36
-; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:49
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v8, v1, s[16:19], 0 offen offset:35
-; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:48
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    global_load_ubyte v16, v0, s[0:1] offset:46
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v15, v1, s[16:19], 0 offen offset:47
-; CHECK-NEXT:    global_load_ubyte v15, v0, s[0:1] offset:60
-; CHECK-NEXT:    s_waitcnt vmcnt(33)
-; CHECK-NEXT:    buffer_store_byte v9, v1, s[16:19], 0 offen offset:34
-; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:79
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v2, v1, s[16:19], 0 offen offset:28
-; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:41
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v20, v1, s[16:19], 0 offen offset:42
-; CHECK-NEXT:    global_load_ubyte v20, v0, s[0:1] offset:55
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v10, v1, s[16:19], 0 offen offset:33
-; CHECK-NEXT:    buffer_store_byte v11, v1, s[16:19], 0 offen offset:32
-; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:77
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v14, v1, s[16:19], 0 offen offset:61
-; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:74
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v3, v1, s[16:19], 0 offen offset:40
-; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:53
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v4, v1, s[16:19], 0 offen offset:39
-; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:52
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v5, v1, s[16:19], 0 offen offset:38
-; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:51
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v6, v1, s[16:19], 0 offen offset:37
-; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:50
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v18, v1, s[16:19], 0 offen offset:57
-; CHECK-NEXT:    global_load_ubyte v18, v0, s[0:1] offset:70
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v19, v1, s[16:19], 0 offen offset:56
-; CHECK-NEXT:    global_load_ubyte v19, v0, s[0:1] offset:69
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v17, v1, s[16:19], 0 offen offset:58
-; CHECK-NEXT:    global_load_ubyte v17, v0, s[0:1] offset:71
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v7, v1, s[16:19], 0 offen offset:49
-; CHECK-NEXT:    s_waitcnt vmcnt(29)
-; CHECK-NEXT:    buffer_store_byte v8, v1, s[16:19], 0 offen offset:48
-; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:93
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v16, v1, s[16:19], 0 offen offset:46
-; CHECK-NEXT:    global_load_ubyte v16, v0, s[0:1] offset:59
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v15, v1, s[16:19], 0 offen offset:60
-; CHECK-NEXT:    global_load_ubyte v15, v0, s[0:1] offset:73
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    buffer_store_byte v2, v1, s[16:19], 0 offen offset:41
-; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:54
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    buffer_store_byte v20, v1, s[16:19], 0 offen offset:55
-; CHECK-NEXT:    global_load_ubyte v20, v0, s[0:1] offset:68
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v14, v1, s[16:19], 0 offen offset:74
-; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:87
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v3, v1, s[16:19], 0 offen offset:53
-; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:66
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v4, v1, s[16:19], 0 offen offset:52
-; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:65
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v5, v1, s[16:19], 0 offen offset:51
-; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:64
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:62
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v12, v1, s[16:19], 0 offen offset:63
-; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:76
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    buffer_store_byte v6, v1, s[16:19], 0 offen offset:50
-; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:95
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v11, v1, s[16:19], 0 offen offset:77
-; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:90
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    buffer_store_byte v17, v1, s[16:19], 0 offen offset:71
-; CHECK-NEXT:    global_load_ubyte v17, v0, s[0:1] offset:83
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v18, v1, s[16:19], 0 offen offset:70
-; CHECK-NEXT:    buffer_store_byte v19, v1, s[16:19], 0 offen offset:69
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v16, v1, s[16:19], 0 offen offset:59
-; CHECK-NEXT:    global_load_ubyte v16, v0, s[0:1] offset:72
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v15, v1, s[16:19], 0 offen offset:73
-; CHECK-NEXT:    global_load_ubyte v15, v0, s[0:1] offset:85
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v2, v1, s[16:19], 0 offen offset:54
-; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:67
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v20, v1, s[16:19], 0 offen offset:68
-; CHECK-NEXT:    global_load_ubyte v20, v0, s[0:1] offset:81
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v3, v1, s[16:19], 0 offen offset:66
-; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:111
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v4, v1, s[16:19], 0 offen offset:65
-; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:110
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v5, v1, s[16:19], 0 offen offset:64
-; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:109
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    buffer_store_byte v13, v1, s[16:19], 0 offen offset:62
-; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:75
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    buffer_store_byte v12, v1, s[16:19], 0 offen offset:76
-; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:89
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    buffer_store_byte v11, v1, s[16:19], 0 offen offset:90
-; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:103
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v16, v1, s[16:19], 0 offen offset:72
-; CHECK-NEXT:    global_load_ubyte v16, v0, s[0:1] offset:86
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    global_load_ubyte v18, v0, s[0:1] offset:84
-; CHECK-NEXT:    global_load_ubyte v19, v0, s[0:1] offset:82
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v14, v1, s[16:19], 0 offen offset:87
-; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:100
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    buffer_store_byte v2, v1, s[16:19], 0 offen offset:67
-; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:80
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:78
-; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:94
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v9, v1, s[16:19], 0 offen offset:79
-; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:92
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v6, v1, s[16:19], 0 offen offset:95
-; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:108
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v8, v1, s[16:19], 0 offen offset:93
-; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:106
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    buffer_store_byte v13, v1, s[16:19], 0 offen offset:75
-; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:88
+; CHECK-NEXT:    buffer_store_dword v23, v25, s[16:19], 0 offen offset:44
+; CHECK-NEXT:    buffer_store_dword v22, v25, s[16:19], 0 offen offset:40
+; CHECK-NEXT:    buffer_store_dword v21, v25, s[16:19], 0 offen offset:36
+; CHECK-NEXT:    buffer_store_dword v20, v25, s[16:19], 0 offen offset:32
+; CHECK-NEXT:    s_waitcnt vmcnt(21)
+; CHECK-NEXT:    buffer_store_dword v3, v25, s[16:19], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v2, v25, s[16:19], 0 offen offset:24
+; CHECK-NEXT:    buffer_store_dword v1, v25, s[16:19], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v0, v25, s[16:19], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    buffer_store_byte v12, v1, s[16:19], 0 offen offset:89
-; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:102
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v10, v1, s[16:19], 0 offen offset:78
-; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:91
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v7, v1, s[16:19], 0 offen offset:94
-; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:107
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v9, v1, s[16:19], 0 offen offset:92
-; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:105
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v13, v1, s[16:19], 0 offen offset:88
-; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:101
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v10, v1, s[16:19], 0 offen offset:91
-; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:104
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v16, v1, s[16:19], 0 offen offset:86
-; CHECK-NEXT:    buffer_store_byte v15, v1, s[16:19], 0 offen offset:85
-; CHECK-NEXT:    buffer_store_byte v18, v1, s[16:19], 0 offen offset:84
-; CHECK-NEXT:    buffer_store_byte v17, v1, s[16:19], 0 offen offset:83
-; CHECK-NEXT:    buffer_store_byte v19, v1, s[16:19], 0 offen offset:82
-; CHECK-NEXT:    global_load_ubyte v15, v0, s[0:1] offset:96
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    global_load_ubyte v16, v0, s[0:1] offset:97
-; CHECK-NEXT:    global_load_ubyte v17, v0, s[0:1] offset:98
-; CHECK-NEXT:    global_load_ubyte v18, v0, s[0:1] offset:99
-; CHECK-NEXT:    global_load_ubyte v19, v0, s[0:1] offset:120
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v20, v1, s[16:19], 0 offen offset:81
-; CHECK-NEXT:    buffer_store_byte v2, v1, s[16:19], 0 offen offset:80
-; CHECK-NEXT:    buffer_store_byte v3, v1, s[16:19], 0 offen offset:111
-; CHECK-NEXT:    buffer_store_byte v4, v1, s[16:19], 0 offen offset:110
-; CHECK-NEXT:    buffer_store_byte v5, v1, s[16:19], 0 offen offset:109
-; CHECK-NEXT:    buffer_store_byte v6, v1, s[16:19], 0 offen offset:108
-; CHECK-NEXT:    buffer_store_byte v14, v1, s[16:19], 0 offen offset:100
-; CHECK-NEXT:    global_load_ubyte v20, v0, s[0:1] offset:121
-; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:122
-; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:123
-; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:124
-; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:125
-; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:126
-; CHECK-NEXT:    s_waitcnt vmcnt(29)
-; CHECK-NEXT:    buffer_store_byte v7, v1, s[16:19], 0 offen offset:107
-; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:127
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v8, v1, s[16:19], 0 offen offset:106
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v9, v1, s[16:19], 0 offen offset:105
-; CHECK-NEXT:    buffer_store_byte v11, v1, s[16:19], 0 offen offset:103
-; CHECK-NEXT:    buffer_store_byte v12, v1, s[16:19], 0 offen offset:102
-; CHECK-NEXT:    s_waitcnt vmcnt(31)
-; CHECK-NEXT:    buffer_store_byte v13, v1, s[16:19], 0 offen offset:101
-; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:116
-; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:117
-; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:119
-; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:114
-; CHECK-NEXT:    s_waitcnt vmcnt(34)
-; CHECK-NEXT:    buffer_store_byte v10, v1, s[16:19], 0 offen offset:104
-; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:118
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:115
-; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:113
-; CHECK-NEXT:    global_load_ubyte v21, v0, s[0:1] offset:112
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v18, v1, s[16:19], 0 offen offset:99
-; CHECK-NEXT:    buffer_store_byte v17, v1, s[16:19], 0 offen offset:98
-; CHECK-NEXT:    buffer_store_byte v16, v1, s[16:19], 0 offen offset:97
-; CHECK-NEXT:    buffer_store_byte v15, v1, s[16:19], 0 offen offset:96
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v7, v1, s[16:19], 0 offen offset:127
-; CHECK-NEXT:    buffer_store_byte v6, v1, s[16:19], 0 offen offset:126
-; CHECK-NEXT:    buffer_store_byte v5, v1, s[16:19], 0 offen offset:125
-; CHECK-NEXT:    buffer_store_byte v4, v1, s[16:19], 0 offen offset:124
-; CHECK-NEXT:    buffer_store_byte v3, v1, s[16:19], 0 offen offset:123
-; CHECK-NEXT:    buffer_store_byte v2, v1, s[16:19], 0 offen offset:122
-; CHECK-NEXT:    buffer_store_byte v20, v1, s[16:19], 0 offen offset:121
-; CHECK-NEXT:    buffer_store_byte v19, v1, s[16:19], 0 offen offset:120
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v11, v1, s[16:19], 0 offen offset:119
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    buffer_store_byte v10, v1, s[16:19], 0 offen offset:118
-; CHECK-NEXT:    buffer_store_byte v9, v1, s[16:19], 0 offen offset:117
-; CHECK-NEXT:    buffer_store_byte v8, v1, s[16:19], 0 offen offset:116
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v13, v1, s[16:19], 0 offen offset:115
-; CHECK-NEXT:    buffer_store_byte v12, v1, s[16:19], 0 offen offset:114
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    buffer_store_byte v14, v1, s[16:19], 0 offen offset:113
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    buffer_store_byte v21, v1, s[16:19], 0 offen offset:112
+; CHECK-NEXT:    buffer_store_dword v7, v25, s[16:19], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v6, v25, s[16:19], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v5, v25, s[16:19], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v4, v25, s[16:19], 0 offen
 ; CHECK-NEXT:    s_endpgm
 entry:
   tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false)
@@ -1815,363 +451,57 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %
 ; CHECK-NEXT:    s_add_u32 s16, s16, s13
 ; CHECK-NEXT:    s_addc_u32 s17, s17, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v2, s0
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:11
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:2
+; CHECK-NEXT:    v_mov_b32_e32 v26, s0
+; CHECK-NEXT:    buffer_load_dword v3, v26, s[16:19], 0 offen offset:124
+; CHECK-NEXT:    buffer_load_dword v2, v26, s[16:19], 0 offen offset:120
+; CHECK-NEXT:    buffer_load_dword v1, v26, s[16:19], 0 offen offset:116
+; CHECK-NEXT:    buffer_load_dword v0, v26, s[16:19], 0 offen offset:112
+; CHECK-NEXT:    buffer_load_dword v7, v26, s[16:19], 0 offen offset:108
+; CHECK-NEXT:    buffer_load_dword v6, v26, s[16:19], 0 offen offset:104
+; CHECK-NEXT:    buffer_load_dword v5, v26, s[16:19], 0 offen offset:100
+; CHECK-NEXT:    buffer_load_dword v4, v26, s[16:19], 0 offen offset:96
 ; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:31
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v0, s0
-; CHECK-NEXT:    v_mov_b32_e32 v1, s1
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:14
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:13
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:12
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:11
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:10
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:9
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:23
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:8
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:22
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:7
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:21
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:6
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:20
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:5
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:19
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:4
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:18
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:3
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:17
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:1
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:47
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v18
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:31
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:2
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:16
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:45
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:23
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:37
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:22
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:36
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:21
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:35
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:20
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:34
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:19
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:33
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:18
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:32
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:29
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:30
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:44
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:17
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:63
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:16
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:28
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:42
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:26
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:40
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:25
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:39
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:24
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:38
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:27
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:41
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:45
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:59
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:37
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:51
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:36
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:50
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:35
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:49
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:34
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:48
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:46
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:47
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:61
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:29
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:43
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:44
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:58
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:33
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:79
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:32
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:42
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:56
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:40
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:54
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:39
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:53
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:38
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:52
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:41
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:55
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:59
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:73
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:51
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:65
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:50
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:64
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:62
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:63
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:77
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:46
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:60
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:61
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:75
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:43
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:57
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:58
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:72
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:49
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:95
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:48
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:56
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:70
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:54
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:68
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:53
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:67
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:52
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:66
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:55
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:69
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:73
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:87
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:65
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:111
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:64
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:110
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:62
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:76
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:77
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:91
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:60
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:74
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:75
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:89
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:57
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:71
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:72
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:86
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:70
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:84
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:68
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:83
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:67
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:81
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:66
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:80
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:78
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:79
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:93
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:69
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:82
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:87
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:101
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:76
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:90
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:91
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:105
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:74
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:88
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:89
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:103
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:71
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:85
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:86
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:100
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:78
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:92
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:93
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:107
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:90
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:104
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:88
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:102
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:85
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:99
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:94
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:95
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:109
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:92
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:106
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:94
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:108
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:84
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:83
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:82
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:81
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:96
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:97
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:98
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:120
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:80
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:111
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:110
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:109
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:99
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:121
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:122
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:123
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:124
+; CHECK-NEXT:    buffer_load_dword v8, v26, s[16:19], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v9, v26, s[16:19], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v10, v26, s[16:19], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v11, v26, s[16:19], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v12, v26, s[16:19], 0 offen offset:32
+; CHECK-NEXT:    buffer_load_dword v13, v26, s[16:19], 0 offen offset:36
+; CHECK-NEXT:    buffer_load_dword v14, v26, s[16:19], 0 offen offset:40
+; CHECK-NEXT:    buffer_load_dword v15, v26, s[16:19], 0 offen offset:44
+; CHECK-NEXT:    buffer_load_dword v16, v26, s[16:19], 0 offen offset:48
+; CHECK-NEXT:    buffer_load_dword v17, v26, s[16:19], 0 offen offset:52
+; CHECK-NEXT:    buffer_load_dword v18, v26, s[16:19], 0 offen offset:56
+; CHECK-NEXT:    buffer_load_dword v19, v26, s[16:19], 0 offen offset:60
+; CHECK-NEXT:    buffer_load_dword v23, v26, s[16:19], 0 offen offset:92
+; CHECK-NEXT:    buffer_load_dword v22, v26, s[16:19], 0 offen offset:88
+; CHECK-NEXT:    buffer_load_dword v21, v26, s[16:19], 0 offen offset:84
+; CHECK-NEXT:    buffer_load_dword v20, v26, s[16:19], 0 offen offset:80
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v25, s1
+; CHECK-NEXT:    v_mov_b32_e32 v24, s0
+; CHECK-NEXT:    s_waitcnt vmcnt(20)
+; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[0:3] offset:112
+; CHECK-NEXT:    buffer_load_dword v3, v26, s[16:19], 0 offen offset:76
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:107
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:105
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:104
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:103
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:106
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:102
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:101
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:100
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:126
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:116
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:117
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:118
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:119
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:127
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:114
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:115
+; CHECK-NEXT:    buffer_load_dword v2, v26, s[16:19], 0 offen offset:72
+; CHECK-NEXT:    buffer_load_dword v1, v26, s[16:19], 0 offen offset:68
+; CHECK-NEXT:    buffer_load_dword v0, v26, s[16:19], 0 offen offset:64
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:108
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:125
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:113
-; CHECK-NEXT:    buffer_load_ubyte v21, v2, s[16:19], 0 offen offset:112
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:98
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:97
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:96
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:127
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:126
+; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[4:7] offset:96
+; CHECK-NEXT:    buffer_load_dword v4, v26, s[16:19], 0 offen
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    buffer_load_dword v5, v26, s[16:19], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v6, v26, s[16:19], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v7, v26, s[16:19], 0 offen offset:12
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[20:23] offset:80
+; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[0:3] offset:64
+; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[16:19] offset:48
+; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[12:15] offset:32
+; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[8:11] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:125
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:124
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:123
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:122
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:121
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:120
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:119
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:118
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:117
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:116
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:115
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:114
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:113
-; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:112
+; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[4:7]
 ; CHECK-NEXT:    s_endpgm
 entry:
   tail call void @llvm.memcpy.p0.p5.i64(ptr %generic, ptr addrspace(5) %src, i64 128, i1 false)
@@ -2218,279 +548,27 @@ define amdgpu_kernel void @memcpy_p0_p3_optsize(ptr %generic) #1 {
 ; CHECK-LABEL: memcpy_p0_p3_optsize:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-NEXT:    v_mov_b32_e32 v2, 0
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:112
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:113
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:114
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:115
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:116
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:117
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:118
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:119
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v0, s0
-; CHECK-NEXT:    v_mov_b32_e32 v1, s1
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:112
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:113
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:114
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:115
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:116
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:117
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:118
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:119
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:120
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:121
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:122
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:123
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:124
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:125
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:126
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:127
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:120
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:121
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:122
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:123
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:124
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:125
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:126
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:127
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:96
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:97
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:98
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:99
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:100
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:101
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:102
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:103
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:96
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:97
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:98
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:99
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:100
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:101
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:102
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:103
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:104
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:105
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:106
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:107
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:108
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:109
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:110
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:111
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:104
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:105
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:106
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:107
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:108
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:109
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:110
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:111
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:80
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:81
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:82
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:83
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:84
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:85
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:86
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:87
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:80
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:81
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:82
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:83
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:84
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:85
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:86
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:87
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:88
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:89
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:90
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:91
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:92
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:93
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:94
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:95
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:88
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:89
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:90
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:91
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:92
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:93
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:94
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:95
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:64
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:65
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:66
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:67
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:68
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:69
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:70
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:71
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:64
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:65
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:66
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:67
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:68
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:69
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:70
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:71
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:72
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:73
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:74
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:75
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:76
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:77
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:78
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:79
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:72
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:73
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:74
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:75
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:76
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:77
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:78
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:79
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:48
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:49
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:50
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:51
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:52
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:53
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:54
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:55
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:48
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:49
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:50
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:51
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:52
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:53
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:54
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:55
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:56
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:57
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:58
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:59
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:60
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:61
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:62
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:63
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:56
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:57
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:58
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:59
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:60
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:61
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:62
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:63
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:32
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:33
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:34
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:35
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:36
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:37
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:38
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:39
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:32
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:33
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:34
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:35
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:36
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:37
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:38
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:39
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:40
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:41
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:42
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:43
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:44
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:45
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:46
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:47
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:40
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:41
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:42
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:43
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:44
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:45
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:46
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:47
-; CHECK-NEXT:    ds_read_u8 v3, v2
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:1
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:2
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:3
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:4
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:5
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:6
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:7
-; CHECK-NEXT:    ds_read_u8 v11, v2 offset:8
-; CHECK-NEXT:    ds_read_u8 v12, v2 offset:9
-; CHECK-NEXT:    ds_read_u8 v13, v2 offset:10
-; CHECK-NEXT:    ds_read_u8 v14, v2 offset:11
-; CHECK-NEXT:    ds_read_u8 v15, v2 offset:12
-; CHECK-NEXT:    ds_read_u8 v16, v2 offset:13
-; CHECK-NEXT:    ds_read_u8 v17, v2 offset:14
-; CHECK-NEXT:    ds_read_u8 v18, v2 offset:15
-; CHECK-NEXT:    ds_read_u8 v19, v2 offset:16
-; CHECK-NEXT:    ds_read_u8 v20, v2 offset:17
-; CHECK-NEXT:    ds_read_u8 v21, v2 offset:18
-; CHECK-NEXT:    ds_read_u8 v22, v2 offset:19
-; CHECK-NEXT:    ds_read_u8 v23, v2 offset:20
-; CHECK-NEXT:    ds_read_u8 v24, v2 offset:21
-; CHECK-NEXT:    ds_read_u8 v25, v2 offset:22
-; CHECK-NEXT:    ds_read_u8 v26, v2 offset:23
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:16
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:17
-; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:18
-; CHECK-NEXT:    flat_store_byte v[0:1], v22 offset:19
-; CHECK-NEXT:    flat_store_byte v[0:1], v23 offset:20
-; CHECK-NEXT:    flat_store_byte v[0:1], v24 offset:21
-; CHECK-NEXT:    flat_store_byte v[0:1], v25 offset:22
-; CHECK-NEXT:    flat_store_byte v[0:1], v26 offset:23
-; CHECK-NEXT:    ds_read_u8 v19, v2 offset:24
-; CHECK-NEXT:    ds_read_u8 v20, v2 offset:25
-; CHECK-NEXT:    ds_read_u8 v21, v2 offset:26
-; CHECK-NEXT:    ds_read_u8 v22, v2 offset:27
-; CHECK-NEXT:    ds_read_u8 v23, v2 offset:28
-; CHECK-NEXT:    ds_read_u8 v24, v2 offset:29
-; CHECK-NEXT:    ds_read_u8 v25, v2 offset:30
-; CHECK-NEXT:    ds_read_u8 v2, v2 offset:31
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:24
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:25
-; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:26
-; CHECK-NEXT:    flat_store_byte v[0:1], v22 offset:27
-; CHECK-NEXT:    flat_store_byte v[0:1], v23 offset:28
-; CHECK-NEXT:    flat_store_byte v[0:1], v24 offset:29
-; CHECK-NEXT:    flat_store_byte v[0:1], v25 offset:30
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:31
-; CHECK-NEXT:    flat_store_byte v[0:1], v3
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:1
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:2
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:3
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:4
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:5
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:6
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:7
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:8
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:9
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:10
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:11
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:12
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:13
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:14
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:15
+; CHECK-NEXT:    v_mov_b32_e32 v16, 0
+; CHECK-NEXT:    ds_read2_b64 v[0:3], v16 offset1:1
+; CHECK-NEXT:    ds_read2_b64 v[4:7], v16 offset0:2 offset1:3
+; CHECK-NEXT:    ds_read2_b64 v[8:11], v16 offset0:4 offset1:5
+; CHECK-NEXT:    ds_read2_b64 v[12:15], v16 offset0:6 offset1:7
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v21, s1
+; CHECK-NEXT:    v_mov_b32_e32 v20, s0
+; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[0:3]
+; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[4:7] offset:16
+; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[8:11] offset:32
+; CHECK-NEXT:    ds_read2_b64 v[0:3], v16 offset0:8 offset1:9
+; CHECK-NEXT:    ds_read2_b64 v[4:7], v16 offset0:10 offset1:11
+; CHECK-NEXT:    ds_read2_b64 v[8:11], v16 offset0:12 offset1:13
+; CHECK-NEXT:    ds_read2_b64 v[16:19], v16 offset0:14 offset1:15
+; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[12:15] offset:48
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[0:3] offset:64
+; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[4:7] offset:80
+; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[8:11] offset:96
+; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[16:19] offset:112
 ; CHECK-NEXT:    s_endpgm
 entry:
   tail call void @llvm.memcpy.p0.p3.i64(ptr %generic, ptr addrspace(3) @shared, i64 128, i1 false)
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll b/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll
index 7575782c1b2acd..cadc3dadb0a1e9 100644
--- a/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll
@@ -13,55 +13,9 @@ define void @memcpy_p0_p0_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p0_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:15
-; CHECK-NEXT:    flat_load_ubyte v5, v[2:3] offset:14
-; CHECK-NEXT:    flat_load_ubyte v6, v[2:3] offset:13
-; CHECK-NEXT:    flat_load_ubyte v7, v[2:3] offset:12
-; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:11
-; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:10
-; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:9
-; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:8
-; CHECK-NEXT:    flat_load_ubyte v12, v[2:3] offset:7
-; CHECK-NEXT:    flat_load_ubyte v13, v[2:3] offset:6
-; CHECK-NEXT:    flat_load_ubyte v14, v[2:3] offset:5
-; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:4
-; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:3
-; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:2
-; CHECK-NEXT:    flat_load_ubyte v18, v[2:3] offset:1
-; CHECK-NEXT:    flat_load_ubyte v2, v[2:3]
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:1
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2
+; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -73,101 +27,19 @@ define void @memcpy_p0_p0_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p0_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xe
-; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:14
-; CHECK-NEXT:    flat_load_ubyte v5, v[2:3] offset:13
-; CHECK-NEXT:    flat_load_ubyte v6, v[2:3] offset:12
-; CHECK-NEXT:    flat_load_ubyte v7, v[2:3] offset:11
-; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:10
-; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:9
-; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:8
-; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:7
-; CHECK-NEXT:    flat_load_ubyte v12, v[2:3] offset:6
-; CHECK-NEXT:    flat_load_ubyte v13, v[2:3] offset:5
-; CHECK-NEXT:    flat_load_ubyte v14, v[2:3] offset:4
-; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:3
-; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:2
-; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:1
-; CHECK-NEXT:    flat_load_ubyte v18, v[2:3]
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:1
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:30
-; CHECK-NEXT:    flat_load_ubyte v5, v[2:3] offset:29
-; CHECK-NEXT:    flat_load_ubyte v6, v[2:3] offset:28
-; CHECK-NEXT:    flat_load_ubyte v7, v[2:3] offset:27
-; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:26
-; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:25
-; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:24
-; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:23
-; CHECK-NEXT:    flat_load_ubyte v12, v[2:3] offset:22
-; CHECK-NEXT:    flat_load_ubyte v13, v[2:3] offset:21
-; CHECK-NEXT:    flat_load_ubyte v14, v[2:3] offset:20
-; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:19
-; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:18
-; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:17
-; CHECK-NEXT:    flat_load_ubyte v18, v[2:3] offset:16
-; CHECK-NEXT:    flat_load_ubyte v2, v[2:3] offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:15
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:30
+; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:28
+; CHECK-NEXT:    flat_load_dwordx3 v[6:8], v[2:3] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
+; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
+; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(3)
+; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(3)
+; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[6:8] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(3)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -179,104 +51,13 @@ define void @memcpy_p0_p0_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p0_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:15
-; CHECK-NEXT:    flat_load_ubyte v5, v[2:3] offset:14
-; CHECK-NEXT:    flat_load_ubyte v6, v[2:3] offset:13
-; CHECK-NEXT:    flat_load_ubyte v7, v[2:3] offset:12
-; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:11
-; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:10
-; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:9
-; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:8
-; CHECK-NEXT:    flat_load_ubyte v12, v[2:3] offset:7
-; CHECK-NEXT:    flat_load_ubyte v13, v[2:3] offset:6
-; CHECK-NEXT:    flat_load_ubyte v14, v[2:3] offset:5
-; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:4
-; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:3
-; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:2
-; CHECK-NEXT:    flat_load_ubyte v18, v[2:3] offset:1
-; CHECK-NEXT:    flat_load_ubyte v19, v[2:3]
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:1
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v19
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:31
-; CHECK-NEXT:    flat_load_ubyte v5, v[2:3] offset:30
-; CHECK-NEXT:    flat_load_ubyte v6, v[2:3] offset:29
-; CHECK-NEXT:    flat_load_ubyte v7, v[2:3] offset:28
-; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:27
-; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:26
-; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:25
-; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:24
-; CHECK-NEXT:    flat_load_ubyte v12, v[2:3] offset:23
-; CHECK-NEXT:    flat_load_ubyte v13, v[2:3] offset:22
-; CHECK-NEXT:    flat_load_ubyte v14, v[2:3] offset:21
-; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:20
-; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:19
-; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:18
-; CHECK-NEXT:    flat_load_ubyte v18, v[2:3] offset:17
-; CHECK-NEXT:    flat_load_ubyte v2, v[2:3] offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:31
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:16
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[2:3] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[8:11], v[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -288,31 +69,9 @@ define void @memcpy_p0_p0_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p0_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    flat_load_ushort v4, v[2:3] offset:14
-; CHECK-NEXT:    flat_load_ushort v5, v[2:3] offset:12
-; CHECK-NEXT:    flat_load_ushort v6, v[2:3] offset:10
-; CHECK-NEXT:    flat_load_ushort v7, v[2:3] offset:8
-; CHECK-NEXT:    flat_load_ushort v8, v[2:3] offset:6
-; CHECK-NEXT:    flat_load_ushort v9, v[2:3] offset:4
-; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:2
-; CHECK-NEXT:    flat_load_ushort v2, v[2:3]
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -324,55 +83,19 @@ define void @memcpy_p0_p0_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p0_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:30
-; CHECK-NEXT:    flat_load_ushort v5, v[2:3] offset:28
-; CHECK-NEXT:    flat_load_ushort v6, v[2:3] offset:26
-; CHECK-NEXT:    flat_load_ushort v7, v[2:3] offset:24
-; CHECK-NEXT:    flat_load_ushort v8, v[2:3] offset:22
-; CHECK-NEXT:    flat_load_ushort v9, v[2:3] offset:20
-; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:18
-; CHECK-NEXT:    flat_load_ushort v11, v[2:3] offset:16
-; CHECK-NEXT:    flat_load_ushort v12, v[2:3] offset:14
-; CHECK-NEXT:    flat_load_ushort v13, v[2:3] offset:12
-; CHECK-NEXT:    flat_load_ushort v14, v[2:3] offset:10
-; CHECK-NEXT:    flat_load_ushort v15, v[2:3] offset:8
-; CHECK-NEXT:    flat_load_ushort v16, v[2:3] offset:6
-; CHECK-NEXT:    flat_load_ushort v17, v[2:3] offset:4
-; CHECK-NEXT:    flat_load_ushort v18, v[2:3] offset:2
-; CHECK-NEXT:    flat_load_ushort v2, v[2:3]
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v12 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v13 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v14 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v15 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v16 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v17 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v18 offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:30
+; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:28
+; CHECK-NEXT:    flat_load_dwordx3 v[6:8], v[2:3] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
+; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
+; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(3)
+; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(3)
+; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[6:8] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(3)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -384,55 +107,13 @@ define void @memcpy_p0_p0_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p0_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ushort v4, v[2:3] offset:30
-; CHECK-NEXT:    flat_load_ushort v5, v[2:3] offset:28
-; CHECK-NEXT:    flat_load_ushort v6, v[2:3] offset:26
-; CHECK-NEXT:    flat_load_ushort v7, v[2:3] offset:24
-; CHECK-NEXT:    flat_load_ushort v8, v[2:3] offset:22
-; CHECK-NEXT:    flat_load_ushort v9, v[2:3] offset:20
-; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:18
-; CHECK-NEXT:    flat_load_ushort v11, v[2:3] offset:16
-; CHECK-NEXT:    flat_load_ushort v12, v[2:3] offset:14
-; CHECK-NEXT:    flat_load_ushort v13, v[2:3] offset:12
-; CHECK-NEXT:    flat_load_ushort v14, v[2:3] offset:10
-; CHECK-NEXT:    flat_load_ushort v15, v[2:3] offset:8
-; CHECK-NEXT:    flat_load_ushort v16, v[2:3] offset:6
-; CHECK-NEXT:    flat_load_ushort v17, v[2:3] offset:4
-; CHECK-NEXT:    flat_load_ushort v18, v[2:3] offset:2
-; CHECK-NEXT:    flat_load_ushort v2, v[2:3]
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v12 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v13 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v14 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v15 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v16 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v17 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v18 offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[2:3] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[8:11], v[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -458,58 +139,13 @@ define void @memcpy_p0_p0_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p0_sz31_align_8_8:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x10
-; CHECK-NEXT:    flat_load_ubyte v6, v[2:3] offset:17
-; CHECK-NEXT:    flat_load_ubyte v7, v[2:3] offset:18
-; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:15
-; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:16
-; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:21
-; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:22
-; CHECK-NEXT:    flat_load_ubyte v12, v[2:3] offset:19
-; CHECK-NEXT:    flat_load_ubyte v13, v[2:3] offset:20
-; CHECK-NEXT:    flat_load_ubyte v14, v[2:3] offset:25
-; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:26
-; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:23
-; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:24
-; CHECK-NEXT:    flat_load_ubyte v18, v[2:3] offset:29
-; CHECK-NEXT:    flat_load_ubyte v19, v[2:3] offset:30
-; CHECK-NEXT:    flat_load_ubyte v20, v[2:3] offset:27
-; CHECK-NEXT:    flat_load_ubyte v21, v[2:3] offset:28
-; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
-; CHECK-NEXT:    s_waitcnt vmcnt(16) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[2:3] offset:15
+; CHECK-NEXT:    flat_load_dwordx4 v[8:11], v[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7] offset:15
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -553,58 +189,13 @@ define void @memcpy_p0_p0_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr a
 ; CHECK-LABEL: memcpy_p0_p0_sz31_align_16_16:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x10
-; CHECK-NEXT:    flat_load_ubyte v6, v[2:3] offset:17
-; CHECK-NEXT:    flat_load_ubyte v7, v[2:3] offset:18
-; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:15
-; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:16
-; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:21
-; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:22
-; CHECK-NEXT:    flat_load_ubyte v12, v[2:3] offset:19
-; CHECK-NEXT:    flat_load_ubyte v13, v[2:3] offset:20
-; CHECK-NEXT:    flat_load_ubyte v14, v[2:3] offset:25
-; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:26
-; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:23
-; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:24
-; CHECK-NEXT:    flat_load_ubyte v18, v[2:3] offset:29
-; CHECK-NEXT:    flat_load_ubyte v19, v[2:3] offset:30
-; CHECK-NEXT:    flat_load_ubyte v20, v[2:3] offset:27
-; CHECK-NEXT:    flat_load_ubyte v21, v[2:3] offset:28
-; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
-; CHECK-NEXT:    s_waitcnt vmcnt(16) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[2:3] offset:15
+; CHECK-NEXT:    flat_load_dwordx4 v[8:11], v[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7] offset:15
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -634,55 +225,9 @@ define void @memcpy_p0_p1_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p1_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:15
-; CHECK-NEXT:    global_load_ubyte v5, v[2:3], off offset:14
-; CHECK-NEXT:    global_load_ubyte v6, v[2:3], off offset:13
-; CHECK-NEXT:    global_load_ubyte v7, v[2:3], off offset:12
-; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:11
-; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:10
-; CHECK-NEXT:    global_load_ubyte v10, v[2:3], off offset:9
-; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:8
-; CHECK-NEXT:    global_load_ubyte v12, v[2:3], off offset:7
-; CHECK-NEXT:    global_load_ubyte v13, v[2:3], off offset:6
-; CHECK-NEXT:    global_load_ubyte v14, v[2:3], off offset:5
-; CHECK-NEXT:    global_load_ubyte v15, v[2:3], off offset:4
-; CHECK-NEXT:    global_load_ubyte v16, v[2:3], off offset:3
-; CHECK-NEXT:    global_load_ubyte v17, v[2:3], off offset:2
-; CHECK-NEXT:    global_load_ubyte v18, v[2:3], off offset:1
-; CHECK-NEXT:    global_load_ubyte v2, v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:1
+; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -694,101 +239,19 @@ define void @memcpy_p0_p1_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p1_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xe
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:14
-; CHECK-NEXT:    global_load_ubyte v5, v[2:3], off offset:13
-; CHECK-NEXT:    global_load_ubyte v6, v[2:3], off offset:12
-; CHECK-NEXT:    global_load_ubyte v7, v[2:3], off offset:11
-; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:10
-; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:9
-; CHECK-NEXT:    global_load_ubyte v10, v[2:3], off offset:8
-; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:7
-; CHECK-NEXT:    global_load_ubyte v12, v[2:3], off offset:6
-; CHECK-NEXT:    global_load_ubyte v13, v[2:3], off offset:5
-; CHECK-NEXT:    global_load_ubyte v14, v[2:3], off offset:4
-; CHECK-NEXT:    global_load_ubyte v15, v[2:3], off offset:3
-; CHECK-NEXT:    global_load_ubyte v16, v[2:3], off offset:2
-; CHECK-NEXT:    global_load_ubyte v17, v[2:3], off offset:1
-; CHECK-NEXT:    global_load_ubyte v18, v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:1
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:30
-; CHECK-NEXT:    global_load_ubyte v5, v[2:3], off offset:29
-; CHECK-NEXT:    global_load_ubyte v6, v[2:3], off offset:28
-; CHECK-NEXT:    global_load_ubyte v7, v[2:3], off offset:27
-; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:26
-; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:25
-; CHECK-NEXT:    global_load_ubyte v10, v[2:3], off offset:24
-; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:23
-; CHECK-NEXT:    global_load_ubyte v12, v[2:3], off offset:22
-; CHECK-NEXT:    global_load_ubyte v13, v[2:3], off offset:21
-; CHECK-NEXT:    global_load_ubyte v14, v[2:3], off offset:20
-; CHECK-NEXT:    global_load_ubyte v15, v[2:3], off offset:19
-; CHECK-NEXT:    global_load_ubyte v16, v[2:3], off offset:18
-; CHECK-NEXT:    global_load_ubyte v17, v[2:3], off offset:17
-; CHECK-NEXT:    global_load_ubyte v18, v[2:3], off offset:16
-; CHECK-NEXT:    global_load_ubyte v2, v[2:3], off offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:19
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:30
+; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:28
+; CHECK-NEXT:    global_load_dwordx3 v[6:8], v[2:3], off offset:16
+; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:18
+; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:17
+; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:16
+; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[6:8] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:15
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -800,104 +263,13 @@ define void @memcpy_p0_p1_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p1_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:15
-; CHECK-NEXT:    global_load_ubyte v5, v[2:3], off offset:14
-; CHECK-NEXT:    global_load_ubyte v6, v[2:3], off offset:13
-; CHECK-NEXT:    global_load_ubyte v7, v[2:3], off offset:12
-; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:11
-; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:10
-; CHECK-NEXT:    global_load_ubyte v10, v[2:3], off offset:9
-; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:8
-; CHECK-NEXT:    global_load_ubyte v12, v[2:3], off offset:7
-; CHECK-NEXT:    global_load_ubyte v13, v[2:3], off offset:6
-; CHECK-NEXT:    global_load_ubyte v14, v[2:3], off offset:5
-; CHECK-NEXT:    global_load_ubyte v15, v[2:3], off offset:4
-; CHECK-NEXT:    global_load_ubyte v16, v[2:3], off offset:3
-; CHECK-NEXT:    global_load_ubyte v17, v[2:3], off offset:2
-; CHECK-NEXT:    global_load_ubyte v18, v[2:3], off offset:1
-; CHECK-NEXT:    global_load_ubyte v19, v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:1
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v19
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:31
-; CHECK-NEXT:    global_load_ubyte v5, v[2:3], off offset:30
-; CHECK-NEXT:    global_load_ubyte v6, v[2:3], off offset:29
-; CHECK-NEXT:    global_load_ubyte v7, v[2:3], off offset:28
-; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:27
-; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:26
-; CHECK-NEXT:    global_load_ubyte v10, v[2:3], off offset:25
-; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:24
-; CHECK-NEXT:    global_load_ubyte v12, v[2:3], off offset:23
-; CHECK-NEXT:    global_load_ubyte v13, v[2:3], off offset:22
-; CHECK-NEXT:    global_load_ubyte v14, v[2:3], off offset:21
-; CHECK-NEXT:    global_load_ubyte v15, v[2:3], off offset:20
-; CHECK-NEXT:    global_load_ubyte v16, v[2:3], off offset:19
-; CHECK-NEXT:    global_load_ubyte v17, v[2:3], off offset:18
-; CHECK-NEXT:    global_load_ubyte v18, v[2:3], off offset:17
-; CHECK-NEXT:    global_load_ubyte v2, v[2:3], off offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:31
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:18
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:16
+; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:17
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:16
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -909,31 +281,9 @@ define void @memcpy_p0_p1_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p1_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    global_load_ushort v4, v[2:3], off offset:14
-; CHECK-NEXT:    global_load_ushort v5, v[2:3], off offset:12
-; CHECK-NEXT:    global_load_ushort v6, v[2:3], off offset:10
-; CHECK-NEXT:    global_load_ushort v7, v[2:3], off offset:8
-; CHECK-NEXT:    global_load_ushort v8, v[2:3], off offset:6
-; CHECK-NEXT:    global_load_ushort v9, v[2:3], off offset:4
-; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:2
-; CHECK-NEXT:    global_load_ushort v2, v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:2
+; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -945,55 +295,19 @@ define void @memcpy_p0_p1_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p1_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:30
-; CHECK-NEXT:    global_load_ushort v5, v[2:3], off offset:28
-; CHECK-NEXT:    global_load_ushort v6, v[2:3], off offset:26
-; CHECK-NEXT:    global_load_ushort v7, v[2:3], off offset:24
-; CHECK-NEXT:    global_load_ushort v8, v[2:3], off offset:22
-; CHECK-NEXT:    global_load_ushort v9, v[2:3], off offset:20
-; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:18
-; CHECK-NEXT:    global_load_ushort v11, v[2:3], off offset:16
-; CHECK-NEXT:    global_load_ushort v12, v[2:3], off offset:14
-; CHECK-NEXT:    global_load_ushort v13, v[2:3], off offset:12
-; CHECK-NEXT:    global_load_ushort v14, v[2:3], off offset:10
-; CHECK-NEXT:    global_load_ushort v15, v[2:3], off offset:8
-; CHECK-NEXT:    global_load_ushort v16, v[2:3], off offset:6
-; CHECK-NEXT:    global_load_ushort v17, v[2:3], off offset:4
-; CHECK-NEXT:    global_load_ushort v18, v[2:3], off offset:2
-; CHECK-NEXT:    global_load_ushort v2, v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v12 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_short v[0:1], v13 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_short v[0:1], v14 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_short v[0:1], v15 offset:8
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:30
+; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:28
+; CHECK-NEXT:    global_load_dwordx3 v[6:8], v[2:3], off offset:16
+; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_short v[0:1], v16 offset:6
+; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_short v[0:1], v17 offset:4
+; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_short v[0:1], v18 offset:2
+; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[6:8] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1005,55 +319,13 @@ define void @memcpy_p0_p1_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p1_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ushort v4, v[2:3], off offset:30
-; CHECK-NEXT:    global_load_ushort v5, v[2:3], off offset:28
-; CHECK-NEXT:    global_load_ushort v6, v[2:3], off offset:26
-; CHECK-NEXT:    global_load_ushort v7, v[2:3], off offset:24
-; CHECK-NEXT:    global_load_ushort v8, v[2:3], off offset:22
-; CHECK-NEXT:    global_load_ushort v9, v[2:3], off offset:20
-; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:18
-; CHECK-NEXT:    global_load_ushort v11, v[2:3], off offset:16
-; CHECK-NEXT:    global_load_ushort v12, v[2:3], off offset:14
-; CHECK-NEXT:    global_load_ushort v13, v[2:3], off offset:12
-; CHECK-NEXT:    global_load_ushort v14, v[2:3], off offset:10
-; CHECK-NEXT:    global_load_ushort v15, v[2:3], off offset:8
-; CHECK-NEXT:    global_load_ushort v16, v[2:3], off offset:6
-; CHECK-NEXT:    global_load_ushort v17, v[2:3], off offset:4
-; CHECK-NEXT:    global_load_ushort v18, v[2:3], off offset:2
-; CHECK-NEXT:    global_load_ushort v2, v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v12 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_short v[0:1], v13 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_short v[0:1], v14 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_short v[0:1], v15 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_short v[0:1], v16 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_short v[0:1], v17 offset:4
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:16
+; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_short v[0:1], v18 offset:2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1080,35 +352,12 @@ define void @memcpy_p0_p1_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addr
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_clause 0x1
-; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off
-; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off offset:15
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:15
+; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7] offset:15
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v8 offset:17
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:15
-; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v9 offset:21
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:19
-; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v10 offset:25
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:23
-; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v11 offset:29
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:27
-; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 24, v8
-; CHECK-NEXT:    v_lshrrev_b32_e32 v3, 8, v8
-; CHECK-NEXT:    v_lshrrev_b32_e32 v4, 24, v9
-; CHECK-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
-; CHECK-NEXT:    v_lshrrev_b32_e32 v6, 24, v10
-; CHECK-NEXT:    v_lshrrev_b32_e32 v7, 8, v10
-; CHECK-NEXT:    v_lshrrev_b32_e32 v8, 24, v11
-; CHECK-NEXT:    v_lshrrev_b32_e32 v9, 8, v11
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:18
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:16
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:22
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:20
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:26
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:24
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:30
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:28
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1153,35 +402,12 @@ define void @memcpy_p0_p1_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr a
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_clause 0x1
-; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off
-; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off offset:15
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:15
+; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7] offset:15
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v8 offset:17
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:15
-; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v9 offset:21
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:19
-; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v10 offset:25
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:23
-; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v11 offset:29
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:27
-; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 24, v8
-; CHECK-NEXT:    v_lshrrev_b32_e32 v3, 8, v8
-; CHECK-NEXT:    v_lshrrev_b32_e32 v4, 24, v9
-; CHECK-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
-; CHECK-NEXT:    v_lshrrev_b32_e32 v6, 24, v10
-; CHECK-NEXT:    v_lshrrev_b32_e32 v7, 8, v10
-; CHECK-NEXT:    v_lshrrev_b32_e32 v8, 24, v11
-; CHECK-NEXT:    v_lshrrev_b32_e32 v9, 8, v11
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:18
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:16
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:22
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:20
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:26
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:24
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:30
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:28
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1211,54 +437,9 @@ define void @memcpy_p0_p3_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p3_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:15
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:14
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:13
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:12
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:11
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:10
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:9
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:8
-; CHECK-NEXT:    ds_read_u8 v11, v2 offset:7
-; CHECK-NEXT:    ds_read_u8 v12, v2 offset:6
-; CHECK-NEXT:    ds_read_u8 v13, v2 offset:5
-; CHECK-NEXT:    ds_read_u8 v14, v2 offset:4
-; CHECK-NEXT:    ds_read_u8 v15, v2 offset:3
-; CHECK-NEXT:    ds_read_u8 v16, v2 offset:2
-; CHECK-NEXT:    ds_read_u8 v17, v2 offset:1
-; CHECK-NEXT:    ds_read_u8 v2, v2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:15
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:14
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:13
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:12
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:11
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:10
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:9
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:8
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:7
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:6
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:5
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:4
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:3
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2
+; CHECK-NEXT:    ds_read2_b64 v[2:5], v2 offset1:1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1270,96 +451,19 @@ define void @memcpy_p0_p3_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p3_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:14
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:13
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:12
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:11
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:10
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:9
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:8
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:7
-; CHECK-NEXT:    ds_read_u8 v11, v2 offset:6
-; CHECK-NEXT:    ds_read_u8 v12, v2 offset:5
-; CHECK-NEXT:    ds_read_u8 v13, v2 offset:4
-; CHECK-NEXT:    ds_read_u8 v14, v2 offset:3
-; CHECK-NEXT:    ds_read_u8 v15, v2 offset:2
-; CHECK-NEXT:    ds_read_u8 v16, v2 offset:1
-; CHECK-NEXT:    ds_read_u8 v17, v2
-; CHECK-NEXT:    ds_read_u8 v18, v2 offset:15
-; CHECK-NEXT:    ds_read_u8 v19, v2 offset:16
-; CHECK-NEXT:    ds_read_u8 v20, v2 offset:17
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:14
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:13
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:12
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:11
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:10
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:9
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:8
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:7
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:6
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:5
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:4
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:3
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:30
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:29
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:28
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:27
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:26
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:25
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:24
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:23
-; CHECK-NEXT:    ds_read_u8 v11, v2 offset:22
-; CHECK-NEXT:    ds_read_u8 v12, v2 offset:21
-; CHECK-NEXT:    ds_read_u8 v13, v2 offset:20
-; CHECK-NEXT:    ds_read_u8 v14, v2 offset:19
-; CHECK-NEXT:    ds_read_u8 v2, v2 offset:18
-; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:30
-; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:29
-; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:28
-; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:27
-; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:26
-; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:25
-; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:24
-; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:23
-; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:22
-; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:21
-; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:20
-; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:19
-; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:18
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:17
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:16
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:15
+; CHECK-NEXT:    ds_read_u8 v9, v2 offset:30
+; CHECK-NEXT:    ds_read_b32 v8, v2 offset:24
+; CHECK-NEXT:    ds_read_u16 v10, v2 offset:28
+; CHECK-NEXT:    ds_read_b64 v[6:7], v2 offset:16
+; CHECK-NEXT:    ds_read2_b64 v[2:5], v2 offset1:1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
+; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
+; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
+; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
+; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[6:8] offset:16
+; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1371,100 +475,12 @@ define void @memcpy_p0_p3_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p3_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:15
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:14
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:13
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:12
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:11
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:10
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:9
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:8
-; CHECK-NEXT:    ds_read_u8 v11, v2 offset:7
-; CHECK-NEXT:    ds_read_u8 v12, v2 offset:6
-; CHECK-NEXT:    ds_read_u8 v13, v2 offset:5
-; CHECK-NEXT:    ds_read_u8 v14, v2 offset:4
-; CHECK-NEXT:    ds_read_u8 v15, v2 offset:3
-; CHECK-NEXT:    ds_read_u8 v16, v2 offset:2
-; CHECK-NEXT:    ds_read_u8 v17, v2 offset:1
-; CHECK-NEXT:    ds_read_u8 v18, v2
-; CHECK-NEXT:    ds_read_u8 v19, v2 offset:16
-; CHECK-NEXT:    ds_read_u8 v20, v2 offset:17
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:15
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:14
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:13
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:12
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:11
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:10
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:9
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:8
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:7
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:6
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:5
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:4
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:3
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:31
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:30
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:29
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:28
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:27
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:26
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:25
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:24
-; CHECK-NEXT:    ds_read_u8 v11, v2 offset:23
-; CHECK-NEXT:    ds_read_u8 v12, v2 offset:22
-; CHECK-NEXT:    ds_read_u8 v13, v2 offset:21
-; CHECK-NEXT:    ds_read_u8 v14, v2 offset:20
-; CHECK-NEXT:    ds_read_u8 v15, v2 offset:19
-; CHECK-NEXT:    ds_read_u8 v2, v2 offset:18
-; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:31
-; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:30
-; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:29
-; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:28
-; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:27
-; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:26
-; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:25
-; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:24
-; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:23
-; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:22
-; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:21
-; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:20
-; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:19
-; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:18
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:17
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:16
+; CHECK-NEXT:    ds_read2_b64 v[3:6], v2 offset0:2 offset1:3
+; CHECK-NEXT:    ds_read2_b64 v[7:10], v2 offset1:1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6] offset:16
+; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[7:10]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1476,30 +492,9 @@ define void @memcpy_p0_p3_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p3_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u16 v3, v2 offset:14
-; CHECK-NEXT:    ds_read_u16 v4, v2 offset:12
-; CHECK-NEXT:    ds_read_u16 v5, v2 offset:10
-; CHECK-NEXT:    ds_read_u16 v6, v2 offset:8
-; CHECK-NEXT:    ds_read_u16 v7, v2 offset:6
-; CHECK-NEXT:    ds_read_u16 v8, v2 offset:4
-; CHECK-NEXT:    ds_read_u16 v9, v2 offset:2
-; CHECK-NEXT:    ds_read_u16 v2, v2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v3 offset:14
-; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:12
-; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:10
-; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:8
-; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:6
-; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:4
-; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    ds_read2_b64 v[2:5], v2 offset1:1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1511,54 +506,19 @@ define void @memcpy_p0_p3_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p3_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:30
-; CHECK-NEXT:    ds_read_u16 v4, v2 offset:28
-; CHECK-NEXT:    ds_read_u16 v5, v2 offset:26
-; CHECK-NEXT:    ds_read_u16 v6, v2 offset:24
-; CHECK-NEXT:    ds_read_u16 v7, v2 offset:22
-; CHECK-NEXT:    ds_read_u16 v8, v2 offset:20
-; CHECK-NEXT:    ds_read_u16 v9, v2 offset:18
-; CHECK-NEXT:    ds_read_u16 v10, v2 offset:16
-; CHECK-NEXT:    ds_read_u16 v11, v2 offset:14
-; CHECK-NEXT:    ds_read_u16 v12, v2 offset:12
-; CHECK-NEXT:    ds_read_u16 v13, v2 offset:10
-; CHECK-NEXT:    ds_read_u16 v14, v2 offset:8
-; CHECK-NEXT:    ds_read_u16 v15, v2 offset:6
-; CHECK-NEXT:    ds_read_u16 v16, v2 offset:4
-; CHECK-NEXT:    ds_read_u16 v17, v2 offset:2
-; CHECK-NEXT:    ds_read_u16 v2, v2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:30
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:28
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:26
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:24
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:22
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:20
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:18
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:16
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:14
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v12 offset:12
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v13 offset:10
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v14 offset:8
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v15 offset:6
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v16 offset:4
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v17 offset:2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    ds_read_u8 v9, v2 offset:30
+; CHECK-NEXT:    ds_read_b32 v8, v2 offset:24
+; CHECK-NEXT:    ds_read_u16 v10, v2 offset:28
+; CHECK-NEXT:    ds_read_b64 v[6:7], v2 offset:16
+; CHECK-NEXT:    ds_read2_b64 v[2:5], v2 offset1:1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
+; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
+; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
+; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
+; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[6:8] offset:16
+; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1570,54 +530,12 @@ define void @memcpy_p0_p3_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p3_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u16 v3, v2 offset:30
-; CHECK-NEXT:    ds_read_u16 v4, v2 offset:28
-; CHECK-NEXT:    ds_read_u16 v5, v2 offset:26
-; CHECK-NEXT:    ds_read_u16 v6, v2 offset:24
-; CHECK-NEXT:    ds_read_u16 v7, v2 offset:22
-; CHECK-NEXT:    ds_read_u16 v8, v2 offset:20
-; CHECK-NEXT:    ds_read_u16 v9, v2 offset:18
-; CHECK-NEXT:    ds_read_u16 v10, v2 offset:16
-; CHECK-NEXT:    ds_read_u16 v11, v2 offset:14
-; CHECK-NEXT:    ds_read_u16 v12, v2 offset:12
-; CHECK-NEXT:    ds_read_u16 v13, v2 offset:10
-; CHECK-NEXT:    ds_read_u16 v14, v2 offset:8
-; CHECK-NEXT:    ds_read_u16 v15, v2 offset:6
-; CHECK-NEXT:    ds_read_u16 v16, v2 offset:4
-; CHECK-NEXT:    ds_read_u16 v17, v2 offset:2
-; CHECK-NEXT:    ds_read_u16 v2, v2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v3 offset:30
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:28
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:26
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:24
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:22
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:20
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:18
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:16
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:14
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v12 offset:12
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v13 offset:10
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v14 offset:8
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v15 offset:6
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v16 offset:4
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v17 offset:2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    ds_read2_b64 v[3:6], v2 offset0:2 offset1:3
+; CHECK-NEXT:    ds_read2_b64 v[7:10], v2 offset1:1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6] offset:16
+; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[7:10]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1643,35 +561,12 @@ define void @memcpy_p0_p3_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p3_sz31_align_8_8:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read2_b64 v[3:6], v2 offset1:1
-; CHECK-NEXT:    ds_read_b128 v[7:10], v2 offset:15
+; CHECK-NEXT:    ds_read_b128 v[3:6], v2 offset:15
+; CHECK-NEXT:    ds_read2_b64 v[7:10], v2 offset1:1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
-; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6] offset:15
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
-; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v7 offset:17
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:15
-; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v8 offset:21
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:19
-; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v9 offset:25
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:23
-; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v10 offset:29
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:27
-; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 24, v7
-; CHECK-NEXT:    v_lshrrev_b32_e32 v3, 8, v7
-; CHECK-NEXT:    v_lshrrev_b32_e32 v4, 24, v8
-; CHECK-NEXT:    v_lshrrev_b32_e32 v5, 8, v8
-; CHECK-NEXT:    v_lshrrev_b32_e32 v6, 24, v9
-; CHECK-NEXT:    v_lshrrev_b32_e32 v7, 8, v9
-; CHECK-NEXT:    v_lshrrev_b32_e32 v8, 24, v10
-; CHECK-NEXT:    v_lshrrev_b32_e32 v9, 8, v10
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:18
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:16
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:22
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:20
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:26
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:24
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:30
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:28
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[7:10]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1714,35 +609,12 @@ define void @memcpy_p0_p3_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr a
 ; CHECK-LABEL: memcpy_p0_p3_sz31_align_16_16:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_b128 v[3:6], v2
-; CHECK-NEXT:    ds_read_b128 v[7:10], v2 offset:15
+; CHECK-NEXT:    ds_read_b128 v[3:6], v2 offset:15
+; CHECK-NEXT:    ds_read_b128 v[7:10], v2
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
-; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6] offset:15
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
-; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v7 offset:17
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:15
-; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v8 offset:21
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:19
-; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v9 offset:25
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:23
-; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v10 offset:29
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:27
-; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 24, v7
-; CHECK-NEXT:    v_lshrrev_b32_e32 v3, 8, v7
-; CHECK-NEXT:    v_lshrrev_b32_e32 v4, 24, v8
-; CHECK-NEXT:    v_lshrrev_b32_e32 v5, 8, v8
-; CHECK-NEXT:    v_lshrrev_b32_e32 v6, 24, v9
-; CHECK-NEXT:    v_lshrrev_b32_e32 v7, 8, v9
-; CHECK-NEXT:    v_lshrrev_b32_e32 v8, 24, v10
-; CHECK-NEXT:    v_lshrrev_b32_e32 v9, 8, v10
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:18
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:16
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:22
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:20
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:26
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:24
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:30
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:28
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[7:10]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1771,55 +643,12 @@ define void @memcpy_p0_p4_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p4_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:15
-; CHECK-NEXT:    global_load_ubyte v5, v[2:3], off offset:14
-; CHECK-NEXT:    global_load_ubyte v6, v[2:3], off offset:13
-; CHECK-NEXT:    global_load_ubyte v7, v[2:3], off offset:12
-; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:11
-; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:10
-; CHECK-NEXT:    global_load_ubyte v10, v[2:3], off offset:9
-; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:8
-; CHECK-NEXT:    global_load_ubyte v12, v[2:3], off offset:7
-; CHECK-NEXT:    global_load_ubyte v13, v[2:3], off offset:6
-; CHECK-NEXT:    global_load_ubyte v14, v[2:3], off offset:5
-; CHECK-NEXT:    global_load_ubyte v15, v[2:3], off offset:4
-; CHECK-NEXT:    global_load_ubyte v16, v[2:3], off offset:3
-; CHECK-NEXT:    global_load_ubyte v17, v[2:3], off offset:2
-; CHECK-NEXT:    global_load_ubyte v18, v[2:3], off offset:1
-; CHECK-NEXT:    global_load_ubyte v2, v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:1
+; CHECK-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
+; CHECK-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[2:3] offset:8
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1831,100 +660,24 @@ define void @memcpy_p0_p4_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p4_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:1
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:1
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:2
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:3
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:4
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:5
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:6
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:7
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:8
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:9
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:10
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:11
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:12
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:13
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:14
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:30
-; CHECK-NEXT:    global_load_ubyte v5, v[2:3], off offset:29
-; CHECK-NEXT:    global_load_ubyte v6, v[2:3], off offset:28
-; CHECK-NEXT:    global_load_ubyte v7, v[2:3], off offset:27
-; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:26
-; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:25
-; CHECK-NEXT:    global_load_ubyte v10, v[2:3], off offset:24
-; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:23
-; CHECK-NEXT:    global_load_ubyte v12, v[2:3], off offset:22
-; CHECK-NEXT:    global_load_ubyte v13, v[2:3], off offset:21
-; CHECK-NEXT:    global_load_ubyte v14, v[2:3], off offset:20
-; CHECK-NEXT:    global_load_ubyte v15, v[2:3], off offset:19
-; CHECK-NEXT:    global_load_ubyte v16, v[2:3], off offset:18
-; CHECK-NEXT:    global_load_ubyte v17, v[2:3], off offset:17
-; CHECK-NEXT:    global_load_ubyte v18, v[2:3], off offset:16
-; CHECK-NEXT:    global_load_ubyte v2, v[2:3], off offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:16
+; CHECK-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
+; CHECK-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off offset:8
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[4:5] offset:8
+; CHECK-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[4:5] offset:16
+; CHECK-NEXT:    global_load_dword v4, v[2:3], off offset:24
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dword v[0:1], v4 offset:24
+; CHECK-NEXT:    global_load_ushort v4, v[2:3], off offset:28
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:28
+; CHECK-NEXT:    global_load_ubyte v2, v[2:3], off offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:15
+; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:30
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1936,104 +689,18 @@ define void @memcpy_p0_p4_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p4_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:15
-; CHECK-NEXT:    global_load_ubyte v5, v[2:3], off offset:14
-; CHECK-NEXT:    global_load_ubyte v6, v[2:3], off offset:13
-; CHECK-NEXT:    global_load_ubyte v7, v[2:3], off offset:12
-; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:11
-; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:10
-; CHECK-NEXT:    global_load_ubyte v10, v[2:3], off offset:9
-; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:8
-; CHECK-NEXT:    global_load_ubyte v12, v[2:3], off offset:7
-; CHECK-NEXT:    global_load_ubyte v13, v[2:3], off offset:6
-; CHECK-NEXT:    global_load_ubyte v14, v[2:3], off offset:5
-; CHECK-NEXT:    global_load_ubyte v15, v[2:3], off offset:4
-; CHECK-NEXT:    global_load_ubyte v16, v[2:3], off offset:3
-; CHECK-NEXT:    global_load_ubyte v17, v[2:3], off offset:2
-; CHECK-NEXT:    global_load_ubyte v18, v[2:3], off offset:1
-; CHECK-NEXT:    global_load_ubyte v19, v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:1
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v19
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:31
-; CHECK-NEXT:    global_load_ubyte v5, v[2:3], off offset:30
-; CHECK-NEXT:    global_load_ubyte v6, v[2:3], off offset:29
-; CHECK-NEXT:    global_load_ubyte v7, v[2:3], off offset:28
-; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:27
-; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:26
-; CHECK-NEXT:    global_load_ubyte v10, v[2:3], off offset:25
-; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:24
-; CHECK-NEXT:    global_load_ubyte v12, v[2:3], off offset:23
-; CHECK-NEXT:    global_load_ubyte v13, v[2:3], off offset:22
-; CHECK-NEXT:    global_load_ubyte v14, v[2:3], off offset:21
-; CHECK-NEXT:    global_load_ubyte v15, v[2:3], off offset:20
-; CHECK-NEXT:    global_load_ubyte v16, v[2:3], off offset:19
-; CHECK-NEXT:    global_load_ubyte v17, v[2:3], off offset:18
-; CHECK-NEXT:    global_load_ubyte v18, v[2:3], off offset:17
-; CHECK-NEXT:    global_load_ubyte v2, v[2:3], off offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:31
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:17
+; CHECK-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:16
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
+; CHECK-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off offset:8
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[4:5] offset:8
+; CHECK-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[4:5] offset:16
+; CHECK-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off offset:24
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[2:3] offset:24
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2045,30 +712,12 @@ define void @memcpy_p0_p4_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p4_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    global_load_ushort v4, v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v4
-; CHECK-NEXT:    global_load_ushort v4, v[2:3], off offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:2
-; CHECK-NEXT:    global_load_ushort v4, v[2:3], off offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:4
-; CHECK-NEXT:    global_load_ushort v4, v[2:3], off offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:6
-; CHECK-NEXT:    global_load_ushort v4, v[2:3], off offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:8
-; CHECK-NEXT:    global_load_ushort v4, v[2:3], off offset:10
+; CHECK-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:10
-; CHECK-NEXT:    global_load_ushort v4, v[2:3], off offset:12
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
+; CHECK-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:12
-; CHECK-NEXT:    global_load_ushort v2, v[2:3], off offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v2 offset:14
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[2:3] offset:8
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2080,55 +729,24 @@ define void @memcpy_p0_p4_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p4_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:30
-; CHECK-NEXT:    global_load_ushort v5, v[2:3], off offset:28
-; CHECK-NEXT:    global_load_ushort v6, v[2:3], off offset:26
-; CHECK-NEXT:    global_load_ushort v7, v[2:3], off offset:24
-; CHECK-NEXT:    global_load_ushort v8, v[2:3], off offset:22
-; CHECK-NEXT:    global_load_ushort v9, v[2:3], off offset:20
-; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:18
-; CHECK-NEXT:    global_load_ushort v11, v[2:3], off offset:16
-; CHECK-NEXT:    global_load_ushort v12, v[2:3], off offset:14
-; CHECK-NEXT:    global_load_ushort v13, v[2:3], off offset:12
-; CHECK-NEXT:    global_load_ushort v14, v[2:3], off offset:10
-; CHECK-NEXT:    global_load_ushort v15, v[2:3], off offset:8
-; CHECK-NEXT:    global_load_ushort v16, v[2:3], off offset:6
-; CHECK-NEXT:    global_load_ushort v17, v[2:3], off offset:4
-; CHECK-NEXT:    global_load_ushort v18, v[2:3], off offset:2
-; CHECK-NEXT:    global_load_ushort v2, v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v12 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_short v[0:1], v13 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_short v[0:1], v14 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_short v[0:1], v15 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_short v[0:1], v16 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_short v[0:1], v17 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_short v[0:1], v18 offset:2
+; CHECK-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
+; CHECK-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off offset:8
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[4:5] offset:8
+; CHECK-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[4:5] offset:16
+; CHECK-NEXT:    global_load_dword v4, v[2:3], off offset:24
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    flat_store_dword v[0:1], v4 offset:24
+; CHECK-NEXT:    global_load_ushort v4, v[2:3], off offset:28
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:28
+; CHECK-NEXT:    global_load_ubyte v2, v[2:3], off offset:30
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:30
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2140,55 +758,18 @@ define void @memcpy_p0_p4_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p4_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ushort v4, v[2:3], off offset:30
-; CHECK-NEXT:    global_load_ushort v5, v[2:3], off offset:28
-; CHECK-NEXT:    global_load_ushort v6, v[2:3], off offset:26
-; CHECK-NEXT:    global_load_ushort v7, v[2:3], off offset:24
-; CHECK-NEXT:    global_load_ushort v8, v[2:3], off offset:22
-; CHECK-NEXT:    global_load_ushort v9, v[2:3], off offset:20
-; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:18
-; CHECK-NEXT:    global_load_ushort v11, v[2:3], off offset:16
-; CHECK-NEXT:    global_load_ushort v12, v[2:3], off offset:14
-; CHECK-NEXT:    global_load_ushort v13, v[2:3], off offset:12
-; CHECK-NEXT:    global_load_ushort v14, v[2:3], off offset:10
-; CHECK-NEXT:    global_load_ushort v15, v[2:3], off offset:8
-; CHECK-NEXT:    global_load_ushort v16, v[2:3], off offset:6
-; CHECK-NEXT:    global_load_ushort v17, v[2:3], off offset:4
-; CHECK-NEXT:    global_load_ushort v18, v[2:3], off offset:2
-; CHECK-NEXT:    global_load_ushort v2, v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v12 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_short v[0:1], v13 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_short v[0:1], v14 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_short v[0:1], v15 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_short v[0:1], v16 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_short v[0:1], v17 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_short v[0:1], v18 offset:2
+; CHECK-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
+; CHECK-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[4:5] offset:8
+; CHECK-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[4:5] offset:16
+; CHECK-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off offset:24
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[2:3] offset:24
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2219,30 +800,7 @@ define void @memcpy_p0_p4_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addr
 ; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
 ; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off offset:15
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v2 offset:17
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:15
-; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v3 offset:21
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:19
-; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v4 offset:25
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:23
-; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v5 offset:29
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:27
-; CHECK-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
-; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
-; CHECK-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
-; CHECK-NEXT:    v_lshrrev_b32_e32 v3, 8, v3
-; CHECK-NEXT:    v_lshrrev_b32_e32 v8, 24, v4
-; CHECK-NEXT:    v_lshrrev_b32_e32 v4, 8, v4
-; CHECK-NEXT:    v_lshrrev_b32_e32 v9, 24, v5
-; CHECK-NEXT:    v_lshrrev_b32_e32 v5, 8, v5
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:18
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:16
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:22
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:20
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:26
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:24
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:28
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5] offset:15
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2290,30 +848,7 @@ define void @memcpy_p0_p4_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr a
 ; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
 ; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off offset:15
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v2 offset:17
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:15
-; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v3 offset:21
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:19
-; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v4 offset:25
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:23
-; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v5 offset:29
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:27
-; CHECK-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
-; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
-; CHECK-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
-; CHECK-NEXT:    v_lshrrev_b32_e32 v3, 8, v3
-; CHECK-NEXT:    v_lshrrev_b32_e32 v8, 24, v4
-; CHECK-NEXT:    v_lshrrev_b32_e32 v4, 8, v4
-; CHECK-NEXT:    v_lshrrev_b32_e32 v9, 24, v5
-; CHECK-NEXT:    v_lshrrev_b32_e32 v5, 8, v5
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:18
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:16
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:22
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:20
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:26
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:24
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:28
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5] offset:15
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2342,55 +877,13 @@ define void @memcpy_p0_p5_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p5_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v2, v2, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:1
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2402,99 +895,23 @@ define void @memcpy_p0_p5_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p5_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x11
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:1
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17
-; CHECK-NEXT:    s_clause 0xc
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:17
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:24
+; CHECK-NEXT:    s_clause 0x8
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:23
+; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:19
+; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:30
+; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[7:9] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:18
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:15
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2506,103 +923,19 @@ define void @memcpy_p0_p5_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p5_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x11
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:1
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18
-; CHECK-NEXT:    s_clause 0xd
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:31
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:31
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:23
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:19
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:18
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:16
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[7:10]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2614,31 +947,13 @@ define void @memcpy_p0_p5_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p5_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    buffer_load_ushort v3, v2, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ushort v4, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v5, v2, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ushort v6, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ushort v7, v2, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ushort v8, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ushort v9, v2, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ushort v2, v2, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v3 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:2
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2650,55 +965,23 @@ define void @memcpy_p0_p5_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p5_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ushort v4, v2, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ushort v5, v2, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ushort v6, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ushort v7, v2, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ushort v8, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ushort v9, v2, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ushort v10, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ushort v11, v2, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ushort v12, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v13, v2, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ushort v14, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ushort v15, v2, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ushort v16, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ushort v17, v2, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ushort v2, v2, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_short v[0:1], v12 offset:12
+; CHECK-NEXT:    s_clause 0x8
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_short v[0:1], v13 offset:10
+; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_short v[0:1], v14 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_short v[0:1], v15 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_short v[0:1], v16 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_short v[0:1], v17 offset:2
+; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:30
+; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[7:9] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2710,55 +993,19 @@ define void @memcpy_p0_p5_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p5_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    buffer_load_ushort v3, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ushort v4, v2, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ushort v5, v2, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ushort v6, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ushort v7, v2, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ushort v8, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ushort v9, v2, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ushort v10, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ushort v11, v2, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ushort v12, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v13, v2, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ushort v14, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ushort v15, v2, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ushort v16, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ushort v17, v2, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ushort v2, v2, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v3 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_short v[0:1], v12 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_short v[0:1], v13 offset:10
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_short v[0:1], v14 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_short v[0:1], v15 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_short v[0:1], v16 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_short v[0:1], v17 offset:2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[7:10]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2788,53 +1035,19 @@ define void @memcpy_p0_p5_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p5_sz31_align_8_8:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x13
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    s_clause 0x7
 ; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
 ; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:18
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:15
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:22
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:19
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:26
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:23
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:29
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:30
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:27
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:15
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:27
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v22 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[7:10] offset:15
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2888,53 +1101,19 @@ define void @memcpy_p0_p5_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr a
 ; CHECK-LABEL: memcpy_p0_p5_sz31_align_16_16:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x13
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    s_clause 0x7
 ; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
 ; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:18
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:15
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:22
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:19
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:26
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:23
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:29
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:30
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:27
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:15
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:27
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v22 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[7:10] offset:15
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2970,41 +1149,8 @@ define void @memcpy_p1_p0_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p1_p0_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:5
-; CHECK-NEXT:    flat_load_ubyte v5, v[2:3] offset:6
-; CHECK-NEXT:    flat_load_ubyte v6, v[2:3] offset:7
-; CHECK-NEXT:    flat_load_ubyte v7, v[2:3]
-; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:1
-; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:2
-; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:4
-; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:3
-; CHECK-NEXT:    flat_load_ubyte v12, v[2:3] offset:15
-; CHECK-NEXT:    flat_load_ubyte v13, v[2:3] offset:14
-; CHECK-NEXT:    flat_load_ubyte v14, v[2:3] offset:13
-; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:12
-; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:9
-; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:8
-; CHECK-NEXT:    flat_load_ubyte v18, v[2:3] offset:11
-; CHECK-NEXT:    flat_load_ubyte v2, v[2:3] offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v4, 8, v10
-; CHECK-NEXT:    v_lshl_or_b32 v4, v6, 8, v5
-; CHECK-NEXT:    v_lshl_or_b32 v5, v8, 8, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v11, 8, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v12, 8, v13
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v14, 8, v15
-; CHECK-NEXT:    v_lshl_or_b32 v3, v4, 16, v3
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v16, 8, v17
+; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v18, 8, v2
-; CHECK-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
-; CHECK-NEXT:    v_lshl_or_b32 v5, v8, 16, v7
-; CHECK-NEXT:    v_lshl_or_b32 v4, v10, 16, v9
 ; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -3016,79 +1162,15 @@ define void @memcpy_p1_p0_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p1_p0_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1e
-; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:28
-; CHECK-NEXT:    flat_load_ubyte v5, v[2:3] offset:29
-; CHECK-NEXT:    flat_load_ubyte v6, v[2:3] offset:30
-; CHECK-NEXT:    flat_load_ubyte v7, v[2:3] offset:24
-; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:25
-; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:27
-; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:26
-; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:15
-; CHECK-NEXT:    flat_load_ubyte v12, v[2:3] offset:14
-; CHECK-NEXT:    flat_load_ubyte v13, v[2:3] offset:13
-; CHECK-NEXT:    flat_load_ubyte v14, v[2:3] offset:12
-; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:23
-; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:22
-; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:21
-; CHECK-NEXT:    flat_load_ubyte v18, v[2:3] offset:20
-; CHECK-NEXT:    flat_load_ubyte v19, v[2:3] offset:19
-; CHECK-NEXT:    flat_load_ubyte v20, v[2:3] offset:18
-; CHECK-NEXT:    flat_load_ubyte v21, v[2:3] offset:17
-; CHECK-NEXT:    flat_load_ubyte v22, v[2:3] offset:16
-; CHECK-NEXT:    flat_load_ubyte v23, v[2:3] offset:11
-; CHECK-NEXT:    flat_load_ubyte v24, v[2:3] offset:10
-; CHECK-NEXT:    flat_load_ubyte v25, v[2:3] offset:9
-; CHECK-NEXT:    flat_load_ubyte v26, v[2:3] offset:8
-; CHECK-NEXT:    flat_load_ubyte v27, v[2:3] offset:7
-; CHECK-NEXT:    flat_load_ubyte v28, v[2:3] offset:6
-; CHECK-NEXT:    flat_load_ubyte v29, v[2:3] offset:5
-; CHECK-NEXT:    flat_load_ubyte v30, v[2:3] offset:4
-; CHECK-NEXT:    flat_load_ubyte v31, v[2:3] offset:1
-; CHECK-NEXT:    flat_load_ubyte v32, v[2:3]
-; CHECK-NEXT:    flat_load_ubyte v33, v[2:3] offset:3
-; CHECK-NEXT:    flat_load_ubyte v2, v[2:3] offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(25) lgkmcnt(25)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v4, 8, v9
-; CHECK-NEXT:    v_lshl_or_b32 v4, v6, 8, v5
-; CHECK-NEXT:    s_waitcnt vmcnt(24) lgkmcnt(24)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v10, 8, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(22) lgkmcnt(22)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v11, 8, v12
-; CHECK-NEXT:    s_waitcnt vmcnt(20) lgkmcnt(20)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v13, 8, v14
-; CHECK-NEXT:    s_waitcnt vmcnt(19) lgkmcnt(19)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v7, 8, v15
-; CHECK-NEXT:    s_waitcnt vmcnt(18) lgkmcnt(18)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v15, 8, v16
-; CHECK-NEXT:    v_lshl_or_b32 v7, v4, 16, v3
-; CHECK-NEXT:    s_waitcnt vmcnt(16) lgkmcnt(16)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v17, 8, v18
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v15, v19, 8, v20
-; CHECK-NEXT:    v_lshl_or_b32 v6, v6, 16, v5
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v21, 8, v22
-; CHECK-NEXT:    v_lshl_or_b32 v9, v9, 16, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v23, 8, v24
-; CHECK-NEXT:    v_lshl_or_b32 v5, v11, 16, v13
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v25, 8, v26
-; CHECK-NEXT:    v_lshl_or_b32 v8, v15, 16, v10
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v17, v27, 8, v28
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v16, v29, 8, v30
-; CHECK-NEXT:    v_lshl_or_b32 v4, v14, 16, v12
+; CHECK-NEXT:    s_clause 0x2
+; CHECK-NEXT:    flat_load_dwordx2 v[6:7], v[2:3] offset:23
+; CHECK-NEXT:    flat_load_dwordx2 v[8:9], v[2:3] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
 ; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v18, v31, 8, v32
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v33, 8, v2
-; CHECK-NEXT:    v_lshl_or_b32 v3, v17, 16, v16
-; CHECK-NEXT:    v_lshl_or_b32 v2, v2, 16, v18
 ; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off offset:23
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
 ; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[8:9], off offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -3100,79 +1182,13 @@ define void @memcpy_p1_p0_sz32_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p1_p0_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1f
-; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:29
-; CHECK-NEXT:    flat_load_ubyte v5, v[2:3] offset:30
-; CHECK-NEXT:    flat_load_ubyte v6, v[2:3] offset:31
-; CHECK-NEXT:    flat_load_ubyte v7, v[2:3] offset:24
-; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:25
-; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:26
-; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:28
-; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:27
-; CHECK-NEXT:    flat_load_ubyte v12, v[2:3] offset:15
-; CHECK-NEXT:    flat_load_ubyte v13, v[2:3] offset:14
-; CHECK-NEXT:    flat_load_ubyte v14, v[2:3] offset:13
-; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:12
-; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:23
-; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:22
-; CHECK-NEXT:    flat_load_ubyte v18, v[2:3] offset:21
-; CHECK-NEXT:    flat_load_ubyte v19, v[2:3] offset:20
-; CHECK-NEXT:    flat_load_ubyte v20, v[2:3] offset:19
-; CHECK-NEXT:    flat_load_ubyte v21, v[2:3] offset:18
-; CHECK-NEXT:    flat_load_ubyte v22, v[2:3] offset:17
-; CHECK-NEXT:    flat_load_ubyte v23, v[2:3] offset:16
-; CHECK-NEXT:    flat_load_ubyte v24, v[2:3] offset:11
-; CHECK-NEXT:    flat_load_ubyte v25, v[2:3] offset:10
-; CHECK-NEXT:    flat_load_ubyte v26, v[2:3] offset:9
-; CHECK-NEXT:    flat_load_ubyte v27, v[2:3] offset:8
-; CHECK-NEXT:    flat_load_ubyte v28, v[2:3] offset:7
-; CHECK-NEXT:    flat_load_ubyte v29, v[2:3] offset:6
-; CHECK-NEXT:    flat_load_ubyte v30, v[2:3] offset:5
-; CHECK-NEXT:    flat_load_ubyte v31, v[2:3] offset:4
-; CHECK-NEXT:    flat_load_ubyte v32, v[2:3] offset:1
-; CHECK-NEXT:    flat_load_ubyte v33, v[2:3]
-; CHECK-NEXT:    flat_load_ubyte v34, v[2:3] offset:3
-; CHECK-NEXT:    flat_load_ubyte v2, v[2:3] offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(25) lgkmcnt(25)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v4, 8, v10
-; CHECK-NEXT:    v_lshl_or_b32 v4, v6, 8, v5
-; CHECK-NEXT:    v_lshl_or_b32 v6, v8, 8, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(24) lgkmcnt(24)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v11, 8, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(22) lgkmcnt(22)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v12, 8, v13
-; CHECK-NEXT:    s_waitcnt vmcnt(20) lgkmcnt(20)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v14, 8, v15
-; CHECK-NEXT:    v_lshl_or_b32 v5, v4, 16, v3
-; CHECK-NEXT:    s_waitcnt vmcnt(18) lgkmcnt(18)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v16, 8, v17
-; CHECK-NEXT:    v_lshl_or_b32 v4, v7, 16, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(16) lgkmcnt(16)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v18, 8, v19
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v20, 8, v21
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v22, 8, v23
-; CHECK-NEXT:    v_lshl_or_b32 v3, v9, 16, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v15, v24, 8, v25
-; CHECK-NEXT:    v_lshl_or_b32 v9, v12, 16, v14
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v26, 8, v27
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v17, v28, 8, v29
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v16, v30, 8, v31
-; CHECK-NEXT:    v_lshl_or_b32 v8, v15, 16, v13
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v18, v32, 8, v33
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[2:3] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[8:11], v[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v19, v34, 8, v2
-; CHECK-NEXT:    v_lshl_or_b32 v2, v11, 16, v10
-; CHECK-NEXT:    v_lshl_or_b32 v7, v17, 16, v16
-; CHECK-NEXT:    v_lshl_or_b32 v6, v19, 16, v18
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:16
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[6:9], off
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[8:11], off
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 32, i1 false)
@@ -3183,23 +1199,8 @@ define void @memcpy_p1_p0_sz16_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p1_p0_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    flat_load_ushort v4, v[2:3] offset:6
-; CHECK-NEXT:    flat_load_ushort v5, v[2:3]
-; CHECK-NEXT:    flat_load_ushort v6, v[2:3] offset:12
-; CHECK-NEXT:    flat_load_ushort v7, v[2:3] offset:8
-; CHECK-NEXT:    flat_load_ushort v8, v[2:3] offset:4
-; CHECK-NEXT:    flat_load_ushort v9, v[2:3] offset:2
-; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:14
-; CHECK-NEXT:    flat_load_ushort v11, v[2:3] offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v4, 16, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v9, 16, v5
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v10, 16, v6
+; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v11, 16, v7
 ; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -3211,52 +1212,16 @@ define void @memcpy_p1_p0_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p1_p0_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x13
-; CHECK-NEXT:    flat_load_ushort v4, v[2:3] offset:22
-; CHECK-NEXT:    flat_load_ushort v5, v[2:3] offset:16
-; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:26
-; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:25
-; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:24
-; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:23
-; CHECK-NEXT:    flat_load_ushort v6, v[2:3] offset:20
-; CHECK-NEXT:    flat_load_ushort v12, v[2:3] offset:18
-; CHECK-NEXT:    flat_load_ushort v13, v[2:3] offset:14
-; CHECK-NEXT:    flat_load_ushort v14, v[2:3] offset:12
-; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:30
-; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:29
-; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:28
-; CHECK-NEXT:    flat_load_ubyte v18, v[2:3] offset:27
-; CHECK-NEXT:    flat_load_ushort v19, v[2:3] offset:10
-; CHECK-NEXT:    flat_load_ushort v20, v[2:3] offset:8
-; CHECK-NEXT:    flat_load_ushort v21, v[2:3] offset:6
-; CHECK-NEXT:    flat_load_ushort v22, v[2:3] offset:4
-; CHECK-NEXT:    flat_load_ushort v23, v[2:3] offset:2
-; CHECK-NEXT:    flat_load_ushort v2, v[2:3]
-; CHECK-NEXT:    s_waitcnt vmcnt(16) lgkmcnt(16)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v8, 8, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v10, 8, v11
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(13)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v4, 16, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v12, 16, v5
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v13, 16, v14
-; CHECK-NEXT:    v_lshl_or_b32 v8, v8, 16, v10
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v15, 8, v16
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v17, 8, v18
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v19, 16, v20
+; CHECK-NEXT:    s_clause 0x2
+; CHECK-NEXT:    flat_load_dwordx2 v[6:7], v[2:3] offset:23
+; CHECK-NEXT:    flat_load_dwordx2 v[8:9], v[2:3] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
 ; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v21, 16, v22
-; CHECK-NEXT:    v_lshl_or_b32 v9, v13, 16, v12
+; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off offset:23
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[8:9], off offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v23, 16, v2
-; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off offset:16
 ; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
-; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[8:9], off offset:23
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -3267,39 +1232,13 @@ define void @memcpy_p1_p0_sz32_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p1_p0_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ushort v4, v[2:3] offset:30
-; CHECK-NEXT:    flat_load_ushort v5, v[2:3] offset:28
-; CHECK-NEXT:    flat_load_ushort v6, v[2:3] offset:26
-; CHECK-NEXT:    flat_load_ushort v7, v[2:3] offset:14
-; CHECK-NEXT:    flat_load_ushort v8, v[2:3] offset:12
-; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:10
-; CHECK-NEXT:    flat_load_ushort v11, v[2:3] offset:8
-; CHECK-NEXT:    flat_load_ushort v9, v[2:3] offset:24
-; CHECK-NEXT:    flat_load_ushort v12, v[2:3] offset:22
-; CHECK-NEXT:    flat_load_ushort v13, v[2:3] offset:20
-; CHECK-NEXT:    flat_load_ushort v14, v[2:3] offset:18
-; CHECK-NEXT:    flat_load_ushort v15, v[2:3] offset:16
-; CHECK-NEXT:    flat_load_ushort v16, v[2:3] offset:6
-; CHECK-NEXT:    flat_load_ushort v17, v[2:3] offset:4
-; CHECK-NEXT:    flat_load_ushort v18, v[2:3] offset:2
-; CHECK-NEXT:    flat_load_ushort v19, v[2:3]
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v4, 16, v5
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v6, 16, v9
-; CHECK-NEXT:    v_lshl_or_b32 v9, v7, 16, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v12, 16, v13
-; CHECK-NEXT:    v_lshl_or_b32 v8, v10, 16, v11
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v14, 16, v15
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v16, 16, v17
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[2:3] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[8:11], v[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v18, 16, v19
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:16
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[6:9], off
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[8:11], off
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 32, i1 false)
@@ -3323,47 +1262,13 @@ define void @memcpy_p1_p0_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p1_p0_sz31_align_8_8:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x10
-; CHECK-NEXT:    flat_load_ubyte v6, v[2:3] offset:18
-; CHECK-NEXT:    flat_load_ubyte v7, v[2:3] offset:26
-; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:25
-; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:24
-; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:23
-; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:30
-; CHECK-NEXT:    flat_load_ubyte v12, v[2:3] offset:29
-; CHECK-NEXT:    flat_load_ubyte v13, v[2:3] offset:28
-; CHECK-NEXT:    flat_load_ubyte v14, v[2:3] offset:27
-; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:22
-; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:21
-; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:20
-; CHECK-NEXT:    flat_load_ubyte v18, v[2:3] offset:19
-; CHECK-NEXT:    flat_load_ubyte v19, v[2:3] offset:16
-; CHECK-NEXT:    flat_load_ubyte v20, v[2:3] offset:15
-; CHECK-NEXT:    flat_load_ubyte v21, v[2:3] offset:17
-; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v7, 8, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v9, 8, v10
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v11, 8, v12
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v13, 8, v14
-; CHECK-NEXT:    v_lshl_or_b32 v8, v7, 16, v10
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v15, 8, v16
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v17, 8, v18
-; CHECK-NEXT:    v_lshl_or_b32 v9, v11, 16, v13
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v15, v19, 8, v20
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[2:3] offset:15
+; CHECK-NEXT:    flat_load_dwordx4 v[8:11], v[2:3]
 ; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v6, 8, v21
-; CHECK-NEXT:    v_lshl_or_b32 v7, v14, 16, v12
-; CHECK-NEXT:    v_lshl_or_b32 v6, v6, 16, v15
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off offset:15
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[6:9], off offset:15
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[8:11], off
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 31, i1 false)
@@ -3404,47 +1309,13 @@ define void @memcpy_p1_p0_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr a
 ; CHECK-LABEL: memcpy_p1_p0_sz31_align_16_16:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x10
-; CHECK-NEXT:    flat_load_ubyte v6, v[2:3] offset:18
-; CHECK-NEXT:    flat_load_ubyte v7, v[2:3] offset:26
-; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:25
-; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:24
-; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:23
-; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:30
-; CHECK-NEXT:    flat_load_ubyte v12, v[2:3] offset:29
-; CHECK-NEXT:    flat_load_ubyte v13, v[2:3] offset:28
-; CHECK-NEXT:    flat_load_ubyte v14, v[2:3] offset:27
-; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:22
-; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:21
-; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:20
-; CHECK-NEXT:    flat_load_ubyte v18, v[2:3] offset:19
-; CHECK-NEXT:    flat_load_ubyte v19, v[2:3] offset:16
-; CHECK-NEXT:    flat_load_ubyte v20, v[2:3] offset:15
-; CHECK-NEXT:    flat_load_ubyte v21, v[2:3] offset:17
-; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v7, 8, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v9, 8, v10
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v11, 8, v12
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v13, 8, v14
-; CHECK-NEXT:    v_lshl_or_b32 v8, v7, 16, v10
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v15, 8, v16
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v17, 8, v18
-; CHECK-NEXT:    v_lshl_or_b32 v9, v11, 16, v13
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v15, v19, 8, v20
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[2:3] offset:15
+; CHECK-NEXT:    flat_load_dwordx4 v[8:11], v[2:3]
 ; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v6, 8, v21
-; CHECK-NEXT:    v_lshl_or_b32 v7, v14, 16, v12
-; CHECK-NEXT:    v_lshl_or_b32 v6, v6, 16, v15
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off offset:15
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[6:9], off offset:15
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[8:11], off
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 31, i1 false)
@@ -4042,44 +1913,13 @@ define void @memcpy_p1_p5_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p1_p5_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v4, 8, v3
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v6, 8, v5
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v8, 8, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v10, 8, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v12, 8, v11
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v14, 8, v13
-; CHECK-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v16, 8, v15
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v2, 8, v17
-; CHECK-NEXT:    v_lshl_or_b32 v2, v5, 16, v4
-; CHECK-NEXT:    v_lshl_or_b32 v5, v9, 16, v6
-; CHECK-NEXT:    v_lshl_or_b32 v4, v11, 16, v10
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 16, i1 false)
@@ -4090,81 +1930,21 @@ define void @memcpy_p1_p5_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p1_p5_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1e
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(29)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v4, 8, v3
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v6, 8, v5
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v8, 8, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v10, 8, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(21)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v12, 8, v11
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v14, 8, v13
-; CHECK-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v16, 8, v15
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v18, 8, v17
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v21, 8, v20
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v15, v23, 8, v22
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v19, 8, v25
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v26, 8, v24
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    v_lshl_or_b32 v16, v28, 8, v27
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    v_lshl_or_b32 v17, v30, 8, v29
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    s_waitcnt vmcnt(4)
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v18, v31, 8, v21
-; CHECK-NEXT:    v_lshl_or_b32 v7, v13, 16, v12
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v19, v2, 8, v32
-; CHECK-NEXT:    v_lshl_or_b32 v2, v5, 16, v4
-; CHECK-NEXT:    v_lshl_or_b32 v5, v9, 16, v6
-; CHECK-NEXT:    v_lshl_or_b32 v4, v11, 16, v10
-; CHECK-NEXT:    v_lshl_or_b32 v6, v14, 16, v15
-; CHECK-NEXT:    v_lshl_or_b32 v9, v17, 16, v16
-; CHECK-NEXT:    v_lshl_or_b32 v8, v19, 16, v18
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
-; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off offset:16
-; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[8:9], off offset:23
+; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[7:8], off offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[9:10], off offset:23
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -4175,81 +1955,19 @@ define void @memcpy_p1_p5_sz32_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p1_p5_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1f
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:31
-; CHECK-NEXT:    buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v4, 8, v3
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v6, 8, v5
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v8, 8, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v10, 8, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v12, 8, v11
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v14, 8, v13
-; CHECK-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v16, 8, v15
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v18, 8, v17
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v21, 8, v20
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    v_lshl_or_b32 v15, v23, 8, v22
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v19, 8, v25
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v26, 8, v24
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v16, v28, 8, v27
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v17, v30, 8, v29
-; CHECK-NEXT:    v_lshl_or_b32 v7, v13, 16, v12
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v18, v32, 8, v31
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v19, v2, 8, v33
-; CHECK-NEXT:    v_lshl_or_b32 v2, v5, 16, v4
-; CHECK-NEXT:    v_lshl_or_b32 v5, v9, 16, v6
-; CHECK-NEXT:    v_lshl_or_b32 v4, v11, 16, v10
-; CHECK-NEXT:    v_lshl_or_b32 v6, v14, 16, v15
-; CHECK-NEXT:    v_lshl_or_b32 v9, v17, 16, v16
-; CHECK-NEXT:    v_lshl_or_b32 v8, v19, 16, v18
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[6:9], off offset:16
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:16
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 32, i1 false)
@@ -4260,24 +1978,13 @@ define void @memcpy_p1_p5_sz16_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p1_p5_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    buffer_load_ushort v4, v2, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ushort v5, v2, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ushort v3, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ushort v6, v2, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ushort v7, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v8, v2, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ushort v9, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ushort v10, v2, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v5, 16, v4
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v6, 16, v3
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v8, 16, v7
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v10, 16, v9
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 16, i1 false)
@@ -4288,52 +1995,21 @@ define void @memcpy_p1_p5_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p1_p5_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x13
-; CHECK-NEXT:    buffer_load_ushort v4, v2, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ushort v5, v2, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ushort v3, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ushort v6, v2, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ushort v7, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ushort v8, v2, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ushort v9, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v10, v2, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ushort v19, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ushort v20, v2, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ushort v21, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ushort v22, v2, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v5, 16, v4
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v6, 16, v3
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v8, 16, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v10, 16, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v12, 8, v11
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v14, 8, v13
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v16, 8, v15
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:27
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v18, 8, v17
-; CHECK-NEXT:    v_lshl_or_b32 v9, v9, 16, v10
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v20, 16, v19
+; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[7:8], off offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v22, 16, v21
-; CHECK-NEXT:    v_lshl_or_b32 v8, v12, 16, v11
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
-; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off offset:16
-; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[8:9], off offset:23
+; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[9:10], off offset:23
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -4344,41 +2020,19 @@ define void @memcpy_p1_p5_sz32_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p1_p5_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    buffer_load_ushort v4, v2, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ushort v5, v2, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ushort v3, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ushort v6, v2, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ushort v7, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ushort v8, v2, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ushort v9, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v10, v2, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ushort v11, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ushort v12, v2, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ushort v13, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ushort v14, v2, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ushort v15, v2, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ushort v16, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ushort v17, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ushort v18, v2, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v5, 16, v4
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v6, 16, v3
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v8, 16, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v10, 16, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v12, 16, v11
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v14, 16, v13
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v16, 16, v15
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v18, 16, v17
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[6:9], off offset:16
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:16
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 32, i1 false)
@@ -4406,49 +2060,18 @@ define void @memcpy_p1_p5_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p1_p5_sz31_align_8_8:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x13
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:18
+; CHECK-NEXT:    s_clause 0x7
 ; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
 ; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v8, 8, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v10, 8, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v12, 8, v11
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v14, 8, v13
-; CHECK-NEXT:    v_lshl_or_b32 v10, v2, 16, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v16, 8, v15
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v18, 8, v17
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v20, 8, v19
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:15
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:27
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v22, 8, v21
-; CHECK-NEXT:    v_lshl_or_b32 v9, v9, 16, v8
-; CHECK-NEXT:    v_lshl_or_b32 v8, v12, 16, v11
-; CHECK-NEXT:    v_lshl_or_b32 v7, v14, 16, v13
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:15
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -4500,49 +2123,18 @@ define void @memcpy_p1_p5_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr a
 ; CHECK-LABEL: memcpy_p1_p5_sz31_align_16_16:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x13
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:18
+; CHECK-NEXT:    s_clause 0x7
 ; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
 ; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v8, 8, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v10, 8, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v12, 8, v11
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v14, 8, v13
-; CHECK-NEXT:    v_lshl_or_b32 v10, v2, 16, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v16, 8, v15
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v18, 8, v17
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v20, 8, v19
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:15
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:27
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v22, 8, v21
-; CHECK-NEXT:    v_lshl_or_b32 v9, v9, 16, v8
-; CHECK-NEXT:    v_lshl_or_b32 v8, v12, 16, v11
-; CHECK-NEXT:    v_lshl_or_b32 v7, v14, 16, v13
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:15
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -4577,41 +2169,8 @@ define void @memcpy_p3_p0_sz16_align_1_1(ptr addrspace(3) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p3_p0_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ubyte v3, v[1:2] offset:5
-; CHECK-NEXT:    flat_load_ubyte v4, v[1:2] offset:6
-; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:7
-; CHECK-NEXT:    flat_load_ubyte v6, v[1:2]
-; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:1
-; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:2
-; CHECK-NEXT:    flat_load_ubyte v9, v[1:2] offset:4
-; CHECK-NEXT:    flat_load_ubyte v10, v[1:2] offset:3
-; CHECK-NEXT:    flat_load_ubyte v11, v[1:2] offset:15
-; CHECK-NEXT:    flat_load_ubyte v12, v[1:2] offset:14
-; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:13
-; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:12
-; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:9
-; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:8
-; CHECK-NEXT:    flat_load_ubyte v17, v[1:2] offset:11
-; CHECK-NEXT:    flat_load_ubyte v1, v[1:2] offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v3, 8, v9
-; CHECK-NEXT:    v_lshl_or_b32 v3, v5, 8, v4
-; CHECK-NEXT:    v_lshl_or_b32 v4, v7, 8, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v10, 8, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v11, 8, v12
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v13, 8, v14
-; CHECK-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v15, 8, v16
+; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v17, 8, v1
-; CHECK-NEXT:    v_lshl_or_b32 v1, v5, 16, v4
-; CHECK-NEXT:    v_lshl_or_b32 v4, v7, 16, v6
-; CHECK-NEXT:    v_lshl_or_b32 v3, v9, 16, v8
 ; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -4624,80 +2183,16 @@ define void @memcpy_p3_p0_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p3_p0_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1e
-; CHECK-NEXT:    flat_load_ubyte v3, v[1:2] offset:28
-; CHECK-NEXT:    flat_load_ubyte v4, v[1:2] offset:29
-; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:30
-; CHECK-NEXT:    flat_load_ubyte v6, v[1:2] offset:24
-; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:25
-; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:27
-; CHECK-NEXT:    flat_load_ubyte v9, v[1:2] offset:26
-; CHECK-NEXT:    flat_load_ubyte v10, v[1:2] offset:15
-; CHECK-NEXT:    flat_load_ubyte v11, v[1:2] offset:14
-; CHECK-NEXT:    flat_load_ubyte v12, v[1:2] offset:13
-; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:12
-; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:23
-; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:22
-; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:21
-; CHECK-NEXT:    flat_load_ubyte v17, v[1:2] offset:20
-; CHECK-NEXT:    flat_load_ubyte v18, v[1:2] offset:19
-; CHECK-NEXT:    flat_load_ubyte v19, v[1:2] offset:18
-; CHECK-NEXT:    flat_load_ubyte v20, v[1:2] offset:17
-; CHECK-NEXT:    flat_load_ubyte v21, v[1:2] offset:16
-; CHECK-NEXT:    flat_load_ubyte v22, v[1:2] offset:11
-; CHECK-NEXT:    flat_load_ubyte v23, v[1:2] offset:10
-; CHECK-NEXT:    flat_load_ubyte v24, v[1:2] offset:9
-; CHECK-NEXT:    flat_load_ubyte v25, v[1:2] offset:8
-; CHECK-NEXT:    flat_load_ubyte v26, v[1:2] offset:7
-; CHECK-NEXT:    flat_load_ubyte v27, v[1:2] offset:6
-; CHECK-NEXT:    flat_load_ubyte v28, v[1:2] offset:5
-; CHECK-NEXT:    flat_load_ubyte v29, v[1:2] offset:4
-; CHECK-NEXT:    flat_load_ubyte v30, v[1:2] offset:1
-; CHECK-NEXT:    flat_load_ubyte v31, v[1:2]
-; CHECK-NEXT:    flat_load_ubyte v32, v[1:2] offset:3
-; CHECK-NEXT:    flat_load_ubyte v1, v[1:2] offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(25) lgkmcnt(25)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v3, 8, v8
-; CHECK-NEXT:    v_lshl_or_b32 v3, v5, 8, v4
-; CHECK-NEXT:    s_waitcnt vmcnt(24) lgkmcnt(24)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v9, 8, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(22) lgkmcnt(22)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v10, 8, v11
-; CHECK-NEXT:    s_waitcnt vmcnt(20) lgkmcnt(20)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v12, 8, v13
-; CHECK-NEXT:    s_waitcnt vmcnt(19) lgkmcnt(19)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v6, 8, v14
-; CHECK-NEXT:    s_waitcnt vmcnt(18) lgkmcnt(18)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v14, 8, v15
-; CHECK-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
-; CHECK-NEXT:    s_waitcnt vmcnt(16) lgkmcnt(16)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v16, 8, v17
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v18, 8, v19
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v20, 8, v21
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v22, 8, v23
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v24, 8, v25
-; CHECK-NEXT:    v_lshl_or_b32 v3, v9, 16, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v15, v26, 8, v27
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v28, 8, v29
+; CHECK-NEXT:    s_clause 0x2
+; CHECK-NEXT:    flat_load_dwordx2 v[5:6], v[1:2] offset:23
+; CHECK-NEXT:    flat_load_dwordx2 v[7:8], v[1:2] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
 ; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v16, v30, 8, v31
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v17, v32, 8, v1
-; CHECK-NEXT:    v_lshl_or_b32 v1, v5, 16, v4
-; CHECK-NEXT:    v_lshl_or_b32 v4, v7, 16, v6
-; CHECK-NEXT:    v_lshl_or_b32 v6, v10, 16, v12
-; CHECK-NEXT:    v_lshl_or_b32 v5, v13, 16, v11
-; CHECK-NEXT:    v_lshl_or_b32 v8, v15, 16, v14
-; CHECK-NEXT:    v_lshl_or_b32 v7, v17, 16, v16
-; CHECK-NEXT:    ds_write_b64 v0, v[1:2] offset:23
-; CHECK-NEXT:    ds_write_b64 v0, v[3:4] offset:16
-; CHECK-NEXT:    ds_write2_b64 v0, v[7:8], v[5:6] offset1:1
+; CHECK-NEXT:    ds_write_b64 v0, v[5:6] offset:23
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(2)
+; CHECK-NEXT:    ds_write_b64 v0, v[7:8] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(2)
+; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -4709,79 +2204,13 @@ define void @memcpy_p3_p0_sz32_align_1_1(ptr addrspace(3) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p3_p0_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1f
-; CHECK-NEXT:    flat_load_ubyte v3, v[1:2] offset:29
-; CHECK-NEXT:    flat_load_ubyte v4, v[1:2] offset:30
-; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:31
-; CHECK-NEXT:    flat_load_ubyte v6, v[1:2] offset:24
-; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:25
-; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:26
-; CHECK-NEXT:    flat_load_ubyte v9, v[1:2] offset:28
-; CHECK-NEXT:    flat_load_ubyte v10, v[1:2] offset:27
-; CHECK-NEXT:    flat_load_ubyte v11, v[1:2] offset:15
-; CHECK-NEXT:    flat_load_ubyte v12, v[1:2] offset:14
-; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:13
-; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:12
-; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:23
-; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:22
-; CHECK-NEXT:    flat_load_ubyte v17, v[1:2] offset:21
-; CHECK-NEXT:    flat_load_ubyte v18, v[1:2] offset:20
-; CHECK-NEXT:    flat_load_ubyte v19, v[1:2] offset:19
-; CHECK-NEXT:    flat_load_ubyte v20, v[1:2] offset:18
-; CHECK-NEXT:    flat_load_ubyte v21, v[1:2] offset:17
-; CHECK-NEXT:    flat_load_ubyte v22, v[1:2] offset:16
-; CHECK-NEXT:    flat_load_ubyte v23, v[1:2] offset:11
-; CHECK-NEXT:    flat_load_ubyte v24, v[1:2] offset:10
-; CHECK-NEXT:    flat_load_ubyte v25, v[1:2] offset:9
-; CHECK-NEXT:    flat_load_ubyte v26, v[1:2] offset:8
-; CHECK-NEXT:    flat_load_ubyte v27, v[1:2] offset:7
-; CHECK-NEXT:    flat_load_ubyte v28, v[1:2] offset:6
-; CHECK-NEXT:    flat_load_ubyte v29, v[1:2] offset:5
-; CHECK-NEXT:    flat_load_ubyte v30, v[1:2] offset:4
-; CHECK-NEXT:    flat_load_ubyte v31, v[1:2] offset:1
-; CHECK-NEXT:    flat_load_ubyte v32, v[1:2]
-; CHECK-NEXT:    flat_load_ubyte v33, v[1:2] offset:3
-; CHECK-NEXT:    flat_load_ubyte v1, v[1:2] offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(25) lgkmcnt(25)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v3, 8, v9
-; CHECK-NEXT:    v_lshl_or_b32 v3, v5, 8, v4
-; CHECK-NEXT:    v_lshl_or_b32 v4, v7, 8, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(24) lgkmcnt(24)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v10, 8, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(22) lgkmcnt(22)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v11, 8, v12
-; CHECK-NEXT:    s_waitcnt vmcnt(20) lgkmcnt(20)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v13, 8, v14
-; CHECK-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
-; CHECK-NEXT:    s_waitcnt vmcnt(18) lgkmcnt(18)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v15, 8, v16
-; CHECK-NEXT:    s_waitcnt vmcnt(16) lgkmcnt(16)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v17, 8, v18
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v19, 8, v20
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v21, 8, v22
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v23, 8, v24
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v25, 8, v26
-; CHECK-NEXT:    v_lshl_or_b32 v3, v9, 16, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v15, v27, 8, v28
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v29, 8, v30
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v16, v31, 8, v32
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v17, v33, 8, v1
-; CHECK-NEXT:    v_lshl_or_b32 v1, v5, 16, v4
-; CHECK-NEXT:    v_lshl_or_b32 v4, v7, 16, v6
-; CHECK-NEXT:    v_lshl_or_b32 v6, v11, 16, v10
-; CHECK-NEXT:    v_lshl_or_b32 v5, v13, 16, v12
-; CHECK-NEXT:    v_lshl_or_b32 v8, v15, 16, v14
-; CHECK-NEXT:    v_lshl_or_b32 v7, v17, 16, v16
-; CHECK-NEXT:    ds_write2_b64 v0, v[3:4], v[1:2] offset0:2 offset1:3
-; CHECK-NEXT:    ds_write2_b64 v0, v[7:8], v[5:6] offset1:1
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[3:6], v[1:2] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[7:10], v[1:2]
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT:    ds_write2_b64 v0, v[3:4], v[5:6] offset0:2 offset1:3
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(1)
+; CHECK-NEXT:    ds_write2_b64 v0, v[7:8], v[9:10] offset1:1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -4793,23 +2222,8 @@ define void @memcpy_p3_p0_sz16_align_2_2(ptr addrspace(3) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p3_p0_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    flat_load_ushort v3, v[1:2] offset:6
-; CHECK-NEXT:    flat_load_ushort v4, v[1:2]
-; CHECK-NEXT:    flat_load_ushort v5, v[1:2] offset:12
-; CHECK-NEXT:    flat_load_ushort v6, v[1:2] offset:8
-; CHECK-NEXT:    flat_load_ushort v7, v[1:2] offset:4
-; CHECK-NEXT:    flat_load_ushort v8, v[1:2] offset:2
-; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:14
-; CHECK-NEXT:    flat_load_ushort v10, v[1:2] offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v3, 16, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v1, v8, 16, v4
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v9, 16, v5
+; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v10, 16, v6
 ; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -4822,51 +2236,16 @@ define void @memcpy_p3_p0_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p3_p0_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x13
-; CHECK-NEXT:    flat_load_ushort v3, v[1:2] offset:22
-; CHECK-NEXT:    flat_load_ushort v4, v[1:2] offset:16
-; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:26
-; CHECK-NEXT:    flat_load_ubyte v6, v[1:2] offset:25
-; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:24
-; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:23
-; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:20
-; CHECK-NEXT:    flat_load_ushort v10, v[1:2] offset:18
-; CHECK-NEXT:    flat_load_ushort v11, v[1:2] offset:14
-; CHECK-NEXT:    flat_load_ushort v12, v[1:2] offset:12
-; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:30
-; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:29
-; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:28
-; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:27
-; CHECK-NEXT:    flat_load_ushort v17, v[1:2] offset:10
-; CHECK-NEXT:    flat_load_ushort v18, v[1:2] offset:8
-; CHECK-NEXT:    flat_load_ushort v19, v[1:2] offset:6
-; CHECK-NEXT:    flat_load_ushort v20, v[1:2] offset:4
-; CHECK-NEXT:    flat_load_ushort v21, v[1:2] offset:2
-; CHECK-NEXT:    flat_load_ushort v22, v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v7, 8, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(13)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v3, 16, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v1, v10, 16, v4
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v11, 16, v12
-; CHECK-NEXT:    v_lshl_or_b32 v11, v5, 8, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v13, 8, v14
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v15, 8, v16
-; CHECK-NEXT:    v_lshl_or_b32 v7, v11, 16, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v17, 16, v18
+; CHECK-NEXT:    s_clause 0x2
+; CHECK-NEXT:    flat_load_dwordx2 v[5:6], v[1:2] offset:23
+; CHECK-NEXT:    flat_load_dwordx2 v[7:8], v[1:2] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
 ; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v19, 16, v20
-; CHECK-NEXT:    v_lshl_or_b32 v8, v10, 16, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v21, 16, v22
-; CHECK-NEXT:    ds_write_b64 v0, v[1:2] offset:16
-; CHECK-NEXT:    ds_write2_b64 v0, v[5:6], v[3:4] offset1:1
-; CHECK-NEXT:    ds_write_b64 v0, v[7:8] offset:23
+; CHECK-NEXT:    ds_write_b64 v0, v[5:6] offset:23
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(2)
+; CHECK-NEXT:    ds_write_b64 v0, v[7:8] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(2)
+; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -4878,40 +2257,13 @@ define void @memcpy_p3_p0_sz32_align_2_2(ptr addrspace(3) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p3_p0_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ushort v3, v[1:2] offset:30
-; CHECK-NEXT:    flat_load_ushort v4, v[1:2] offset:28
-; CHECK-NEXT:    flat_load_ushort v5, v[1:2] offset:26
-; CHECK-NEXT:    flat_load_ushort v6, v[1:2] offset:14
-; CHECK-NEXT:    flat_load_ushort v7, v[1:2] offset:12
-; CHECK-NEXT:    flat_load_ushort v8, v[1:2] offset:10
-; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:8
-; CHECK-NEXT:    flat_load_ushort v10, v[1:2] offset:24
-; CHECK-NEXT:    flat_load_ushort v11, v[1:2] offset:22
-; CHECK-NEXT:    flat_load_ushort v12, v[1:2] offset:20
-; CHECK-NEXT:    flat_load_ushort v13, v[1:2] offset:18
-; CHECK-NEXT:    flat_load_ushort v14, v[1:2] offset:16
-; CHECK-NEXT:    flat_load_ushort v15, v[1:2] offset:6
-; CHECK-NEXT:    flat_load_ushort v16, v[1:2] offset:4
-; CHECK-NEXT:    flat_load_ushort v17, v[1:2] offset:2
-; CHECK-NEXT:    flat_load_ushort v18, v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v3, 16, v4
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(11)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v6, 16, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v1, v5, 16, v10
-; CHECK-NEXT:    v_lshl_or_b32 v5, v8, 16, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v11, 16, v12
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v13, 16, v14
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v15, 16, v16
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v17, 16, v18
-; CHECK-NEXT:    ds_write2_b64 v0, v[3:4], v[1:2] offset0:2 offset1:3
-; CHECK-NEXT:    ds_write2_b64 v0, v[7:8], v[5:6] offset1:1
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[3:6], v[1:2] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[7:10], v[1:2]
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT:    ds_write2_b64 v0, v[3:4], v[5:6] offset0:2 offset1:3
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(1)
+; CHECK-NEXT:    ds_write2_b64 v0, v[7:8], v[9:10] offset1:1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -4937,47 +2289,13 @@ define void @memcpy_p3_p0_sz31_align_8_8(ptr addrspace(3) align 8 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p3_p0_sz31_align_8_8:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x10
-; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:18
-; CHECK-NEXT:    flat_load_ubyte v6, v[1:2] offset:26
-; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:25
-; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:24
-; CHECK-NEXT:    flat_load_ubyte v9, v[1:2] offset:23
-; CHECK-NEXT:    flat_load_ubyte v10, v[1:2] offset:30
-; CHECK-NEXT:    flat_load_ubyte v11, v[1:2] offset:29
-; CHECK-NEXT:    flat_load_ubyte v12, v[1:2] offset:28
-; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:27
-; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:22
-; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:21
-; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:20
-; CHECK-NEXT:    flat_load_ubyte v17, v[1:2] offset:19
-; CHECK-NEXT:    flat_load_ubyte v18, v[1:2] offset:16
-; CHECK-NEXT:    flat_load_ubyte v19, v[1:2] offset:15
-; CHECK-NEXT:    flat_load_ubyte v20, v[1:2] offset:17
-; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v6, 8, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v8, 8, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v10, 8, v11
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v12, 8, v13
-; CHECK-NEXT:    v_lshl_or_b32 v7, v6, 16, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v14, 8, v15
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v16, 8, v17
-; CHECK-NEXT:    v_lshl_or_b32 v8, v10, 16, v12
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v18, 8, v19
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[3:6], v[1:2]
+; CHECK-NEXT:    flat_load_dwordx4 v[7:10], v[1:2] offset:15
 ; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v5, 8, v20
-; CHECK-NEXT:    v_lshl_or_b32 v6, v13, 16, v11
-; CHECK-NEXT:    v_lshl_or_b32 v5, v5, 16, v14
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
-; CHECK-NEXT:    ds_write_b128 v0, v[5:8] offset:15
+; CHECK-NEXT:    ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(1)
+; CHECK-NEXT:    ds_write_b128 v0, v[7:10] offset:15
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -5021,47 +2339,13 @@ define void @memcpy_p3_p0_sz31_align_16_16(ptr addrspace(3) align 16 %dst, ptr a
 ; CHECK-LABEL: memcpy_p3_p0_sz31_align_16_16:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x10
-; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:18
-; CHECK-NEXT:    flat_load_ubyte v6, v[1:2] offset:26
-; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:25
-; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:24
-; CHECK-NEXT:    flat_load_ubyte v9, v[1:2] offset:23
-; CHECK-NEXT:    flat_load_ubyte v10, v[1:2] offset:30
-; CHECK-NEXT:    flat_load_ubyte v11, v[1:2] offset:29
-; CHECK-NEXT:    flat_load_ubyte v12, v[1:2] offset:28
-; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:27
-; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:22
-; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:21
-; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:20
-; CHECK-NEXT:    flat_load_ubyte v17, v[1:2] offset:19
-; CHECK-NEXT:    flat_load_ubyte v18, v[1:2] offset:16
-; CHECK-NEXT:    flat_load_ubyte v19, v[1:2] offset:15
-; CHECK-NEXT:    flat_load_ubyte v20, v[1:2] offset:17
-; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v6, 8, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v8, 8, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v10, 8, v11
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v12, 8, v13
-; CHECK-NEXT:    v_lshl_or_b32 v7, v6, 16, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v14, 8, v15
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v16, 8, v17
-; CHECK-NEXT:    v_lshl_or_b32 v8, v10, 16, v12
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v18, 8, v19
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[3:6], v[1:2] offset:15
+; CHECK-NEXT:    flat_load_dwordx4 v[7:10], v[1:2]
 ; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v5, 8, v20
-; CHECK-NEXT:    v_lshl_or_b32 v6, v13, 16, v11
-; CHECK-NEXT:    v_lshl_or_b32 v5, v5, 16, v14
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_write_b128 v0, v[1:4]
-; CHECK-NEXT:    ds_write_b128 v0, v[5:8] offset:15
+; CHECK-NEXT:    ds_write_b128 v0, v[3:6] offset:15
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(1)
+; CHECK-NEXT:    ds_write_b128 v0, v[7:10]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -5701,44 +2985,13 @@ define void @memcpy_p3_p5_sz16_align_1_1(ptr addrspace(3) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p3_p5_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v3, 8, v2
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v5, 8, v4
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v7, 8, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v9, 8, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v11, 8, v10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v13, 8, v12
-; CHECK-NEXT:    v_lshl_or_b32 v2, v7, 16, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v15, 8, v14
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v1, 8, v16
-; CHECK-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
-; CHECK-NEXT:    v_lshl_or_b32 v4, v8, 16, v5
-; CHECK-NEXT:    v_lshl_or_b32 v3, v10, 16, v9
-; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; CHECK-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -5750,81 +3003,21 @@ define void @memcpy_p3_p5_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p3_p5_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1e
-; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(29)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v3, 8, v2
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v5, 8, v4
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v7, 8, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v9, 8, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(21)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v11, 8, v10
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v13, 8, v12
-; CHECK-NEXT:    v_lshl_or_b32 v2, v7, 16, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v15, 8, v14
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v17, 8, v16
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v20, 8, v19
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v22, 8, v21
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v18, 8, v24
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v25, 8, v23
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    v_lshl_or_b32 v15, v27, 8, v26
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    v_lshl_or_b32 v16, v29, 8, v28
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_load_dword v9, v1, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    s_waitcnt vmcnt(4)
+; CHECK-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v17, v30, 8, v20
-; CHECK-NEXT:    v_lshl_or_b32 v6, v12, 16, v11
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v18, v1, 8, v31
-; CHECK-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
-; CHECK-NEXT:    v_lshl_or_b32 v4, v8, 16, v5
-; CHECK-NEXT:    v_lshl_or_b32 v3, v10, 16, v9
-; CHECK-NEXT:    v_lshl_or_b32 v5, v13, 16, v14
-; CHECK-NEXT:    v_lshl_or_b32 v8, v16, 16, v15
-; CHECK-NEXT:    v_lshl_or_b32 v7, v18, 16, v17
-; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
-; CHECK-NEXT:    ds_write_b64 v0, v[5:6] offset:16
-; CHECK-NEXT:    ds_write_b64 v0, v[7:8] offset:23
+; CHECK-NEXT:    ds_write_b64 v0, v[6:7] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    ds_write_b64 v0, v[8:9] offset:23
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -5836,81 +3029,19 @@ define void @memcpy_p3_p5_sz32_align_1_1(ptr addrspace(3) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p3_p5_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1f
-; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:31
-; CHECK-NEXT:    buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v3, 8, v2
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v5, 8, v4
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v7, 8, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v9, 8, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v11, 8, v10
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v13, 8, v12
-; CHECK-NEXT:    v_lshl_or_b32 v2, v7, 16, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v15, 8, v14
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v17, 8, v16
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v20, 8, v19
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v22, 8, v21
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v18, 8, v24
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v25, 8, v23
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v15, v27, 8, v26
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v9, v1, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v16, v29, 8, v28
-; CHECK-NEXT:    v_lshl_or_b32 v6, v12, 16, v11
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v17, v31, 8, v30
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v18, v1, 8, v32
-; CHECK-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
-; CHECK-NEXT:    v_lshl_or_b32 v4, v8, 16, v5
-; CHECK-NEXT:    v_lshl_or_b32 v3, v10, 16, v9
-; CHECK-NEXT:    v_lshl_or_b32 v5, v13, 16, v14
-; CHECK-NEXT:    v_lshl_or_b32 v8, v16, 16, v15
-; CHECK-NEXT:    v_lshl_or_b32 v7, v18, 16, v17
-; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
-; CHECK-NEXT:    ds_write2_b64 v0, v[5:6], v[7:8] offset0:2 offset1:3
+; CHECK-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    ds_write2_b64 v0, v[6:7], v[8:9] offset0:2 offset1:3
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -5922,24 +3053,13 @@ define void @memcpy_p3_p5_sz16_align_2_2(ptr addrspace(3) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p3_p5_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    buffer_load_ushort v3, v1, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ushort v4, v1, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ushort v2, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ushort v5, v1, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ushort v6, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v7, v1, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ushort v8, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ushort v9, v1, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v5, 16, v2
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v7, 16, v6
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v9, 16, v8
-; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; CHECK-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -5951,52 +3071,21 @@ define void @memcpy_p3_p5_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p3_p5_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x13
-; CHECK-NEXT:    buffer_load_ushort v3, v1, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ushort v4, v1, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ushort v2, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ushort v5, v1, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ushort v6, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ushort v7, v1, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ushort v8, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v9, v1, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ushort v18, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ushort v19, v1, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ushort v20, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ushort v21, v1, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v5, 16, v2
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v7, 16, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v9, 16, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v11, 8, v10
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v13, 8, v12
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v15, 8, v14
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_load_dword v9, v1, s[0:3], 0 offen offset:27
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v17, 8, v16
-; CHECK-NEXT:    v_lshl_or_b32 v8, v8, 16, v9
+; CHECK-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v19, 16, v18
+; CHECK-NEXT:    ds_write_b64 v0, v[6:7] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v21, 16, v20
-; CHECK-NEXT:    v_lshl_or_b32 v7, v11, 16, v10
-; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
-; CHECK-NEXT:    ds_write_b64 v0, v[5:6] offset:16
-; CHECK-NEXT:    ds_write_b64 v0, v[7:8] offset:23
+; CHECK-NEXT:    ds_write_b64 v0, v[8:9] offset:23
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -6008,41 +3097,19 @@ define void @memcpy_p3_p5_sz32_align_2_2(ptr addrspace(3) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p3_p5_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    buffer_load_ushort v3, v1, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ushort v4, v1, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ushort v2, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ushort v5, v1, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ushort v6, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ushort v7, v1, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ushort v8, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v9, v1, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ushort v10, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ushort v11, v1, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ushort v12, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ushort v13, v1, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ushort v14, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ushort v15, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ushort v16, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ushort v17, v1, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v5, 16, v2
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v7, 16, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v9, 16, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v11, 16, v10
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v9, v1, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v13, 16, v12
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v15, 16, v14
+; CHECK-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v17, 16, v16
-; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
-; CHECK-NEXT:    ds_write2_b64 v0, v[5:6], v[7:8] offset0:2 offset1:3
+; CHECK-NEXT:    ds_write2_b64 v0, v[6:7], v[8:9] offset0:2 offset1:3
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -6072,50 +3139,19 @@ define void @memcpy_p3_p5_sz31_align_8_8(ptr addrspace(3) align 8 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p3_p5_sz31_align_8_8:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x13
-; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v3, 8, v2
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    v_lshl_or_b32 v1, v9, 8, v4
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v11, 8, v10
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v13, 8, v12
-; CHECK-NEXT:    v_lshl_or_b32 v4, v1, 16, v2
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v15, 8, v14
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v17, 8, v16
-; CHECK-NEXT:    v_lshl_or_b32 v2, v11, 16, v10
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v19, 8, v18
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v9, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:15
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:27
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v21, 8, v20
-; CHECK-NEXT:    v_lshl_or_b32 v3, v9, 16, v3
-; CHECK-NEXT:    v_lshl_or_b32 v1, v13, 16, v12
+; CHECK-NEXT:    ds_write2_b64 v0, v[6:7], v[8:9] offset1:1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    ds_write2_b64 v0, v[5:6], v[7:8] offset1:1
-; CHECK-NEXT:    ds_write_b128 v0, v[1:4] offset:15
+; CHECK-NEXT:    ds_write_b128 v0, v[2:5] offset:15
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -6169,49 +3205,18 @@ define void @memcpy_p3_p5_sz31_align_16_16(ptr addrspace(3) align 16 %dst, ptr a
 ; CHECK-LABEL: memcpy_p3_p5_sz31_align_16_16:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x13
-; CHECK-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:18
+; CHECK-NEXT:    s_clause 0x7
 ; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
 ; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v7, 8, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    v_lshl_or_b32 v1, v9, 8, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v11, 8, v10
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v13, 8, v12
-; CHECK-NEXT:    v_lshl_or_b32 v9, v1, 16, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v15, 8, v14
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v17, 8, v16
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v19, 8, v18
+; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:15
+; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_load_dword v9, v1, s[0:3], 0 offen offset:27
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v21, 8, v20
-; CHECK-NEXT:    v_lshl_or_b32 v8, v8, 16, v7
-; CHECK-NEXT:    v_lshl_or_b32 v7, v11, 16, v10
-; CHECK-NEXT:    v_lshl_or_b32 v6, v13, 16, v12
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    ds_write_b128 v0, v[2:5]
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    ds_write_b128 v0, v[6:9] offset:15
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -6248,55 +3253,12 @@ define void @memcpy_p5_p0_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p0_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ubyte v3, v[1:2] offset:15
-; CHECK-NEXT:    flat_load_ubyte v4, v[1:2] offset:14
-; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:13
-; CHECK-NEXT:    flat_load_ubyte v6, v[1:2] offset:12
-; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:11
-; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:10
-; CHECK-NEXT:    flat_load_ubyte v9, v[1:2] offset:9
-; CHECK-NEXT:    flat_load_ubyte v10, v[1:2] offset:8
-; CHECK-NEXT:    flat_load_ubyte v11, v[1:2] offset:7
-; CHECK-NEXT:    flat_load_ubyte v12, v[1:2] offset:6
-; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:5
-; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:4
-; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:3
-; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:2
-; CHECK-NEXT:    flat_load_ubyte v17, v[1:2] offset:1
-; CHECK-NEXT:    flat_load_ubyte v1, v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 16, i1 false)
@@ -6307,101 +3269,24 @@ define void @memcpy_p5_p0_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p0_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xe
-; CHECK-NEXT:    flat_load_ubyte v3, v[1:2] offset:14
-; CHECK-NEXT:    flat_load_ubyte v4, v[1:2] offset:13
-; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:12
-; CHECK-NEXT:    flat_load_ubyte v6, v[1:2] offset:11
-; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:10
-; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:9
-; CHECK-NEXT:    flat_load_ubyte v9, v[1:2] offset:8
-; CHECK-NEXT:    flat_load_ubyte v10, v[1:2] offset:7
-; CHECK-NEXT:    flat_load_ubyte v11, v[1:2] offset:6
-; CHECK-NEXT:    flat_load_ubyte v12, v[1:2] offset:5
-; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:4
-; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:3
-; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:2
-; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:1
-; CHECK-NEXT:    flat_load_ubyte v17, v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ubyte v3, v[1:2] offset:30
-; CHECK-NEXT:    flat_load_ubyte v4, v[1:2] offset:29
-; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:28
-; CHECK-NEXT:    flat_load_ubyte v6, v[1:2] offset:27
-; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:26
-; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:25
-; CHECK-NEXT:    flat_load_ubyte v9, v[1:2] offset:24
-; CHECK-NEXT:    flat_load_ubyte v10, v[1:2] offset:23
-; CHECK-NEXT:    flat_load_ubyte v11, v[1:2] offset:22
-; CHECK-NEXT:    flat_load_ubyte v12, v[1:2] offset:21
-; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:20
-; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:19
-; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:18
-; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:17
-; CHECK-NEXT:    flat_load_ubyte v17, v[1:2] offset:16
-; CHECK-NEXT:    flat_load_ubyte v1, v[1:2] offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:30
+; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:28
+; CHECK-NEXT:    flat_load_dwordx3 v[5:7], v[1:2] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
 ; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:17
+; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:15
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -6412,104 +3297,19 @@ define void @memcpy_p5_p0_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p0_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ubyte v3, v[1:2] offset:15
-; CHECK-NEXT:    flat_load_ubyte v4, v[1:2] offset:14
-; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:13
-; CHECK-NEXT:    flat_load_ubyte v6, v[1:2] offset:12
-; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:11
-; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:10
-; CHECK-NEXT:    flat_load_ubyte v9, v[1:2] offset:9
-; CHECK-NEXT:    flat_load_ubyte v10, v[1:2] offset:8
-; CHECK-NEXT:    flat_load_ubyte v11, v[1:2] offset:7
-; CHECK-NEXT:    flat_load_ubyte v12, v[1:2] offset:6
-; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:5
-; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:4
-; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:3
-; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:2
-; CHECK-NEXT:    flat_load_ubyte v17, v[1:2] offset:1
-; CHECK-NEXT:    flat_load_ubyte v18, v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ubyte v3, v[1:2] offset:31
-; CHECK-NEXT:    flat_load_ubyte v4, v[1:2] offset:30
-; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:29
-; CHECK-NEXT:    flat_load_ubyte v6, v[1:2] offset:28
-; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:27
-; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:26
-; CHECK-NEXT:    flat_load_ubyte v9, v[1:2] offset:25
-; CHECK-NEXT:    flat_load_ubyte v10, v[1:2] offset:24
-; CHECK-NEXT:    flat_load_ubyte v11, v[1:2] offset:23
-; CHECK-NEXT:    flat_load_ubyte v12, v[1:2] offset:22
-; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:21
-; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:20
-; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:19
-; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:18
-; CHECK-NEXT:    flat_load_ubyte v17, v[1:2] offset:17
-; CHECK-NEXT:    flat_load_ubyte v1, v[1:2] offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:31
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[3:6], v[1:2] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[7:10], v[1:2]
 ; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:17
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 32, i1 false)
@@ -6520,31 +3320,12 @@ define void @memcpy_p5_p0_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p0_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    flat_load_ushort v3, v[1:2] offset:14
-; CHECK-NEXT:    flat_load_ushort v4, v[1:2] offset:12
-; CHECK-NEXT:    flat_load_ushort v5, v[1:2] offset:10
-; CHECK-NEXT:    flat_load_ushort v6, v[1:2] offset:8
-; CHECK-NEXT:    flat_load_ushort v7, v[1:2] offset:6
-; CHECK-NEXT:    flat_load_ushort v8, v[1:2] offset:4
-; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:2
-; CHECK-NEXT:    flat_load_ushort v1, v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
-; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 16, i1 false)
@@ -6555,55 +3336,24 @@ define void @memcpy_p5_p0_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p0_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ubyte v3, v[1:2] offset:30
-; CHECK-NEXT:    flat_load_ushort v4, v[1:2] offset:28
-; CHECK-NEXT:    flat_load_ushort v5, v[1:2] offset:26
-; CHECK-NEXT:    flat_load_ushort v6, v[1:2] offset:24
-; CHECK-NEXT:    flat_load_ushort v7, v[1:2] offset:22
-; CHECK-NEXT:    flat_load_ushort v8, v[1:2] offset:20
-; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:18
-; CHECK-NEXT:    flat_load_ushort v10, v[1:2] offset:16
-; CHECK-NEXT:    flat_load_ushort v11, v[1:2] offset:14
-; CHECK-NEXT:    flat_load_ushort v12, v[1:2] offset:12
-; CHECK-NEXT:    flat_load_ushort v13, v[1:2] offset:10
-; CHECK-NEXT:    flat_load_ushort v14, v[1:2] offset:8
-; CHECK-NEXT:    flat_load_ushort v15, v[1:2] offset:6
-; CHECK-NEXT:    flat_load_ushort v16, v[1:2] offset:4
-; CHECK-NEXT:    flat_load_ushort v17, v[1:2] offset:2
-; CHECK-NEXT:    flat_load_ushort v1, v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(13)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(11)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
-; CHECK-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
-; CHECK-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:30
+; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:28
+; CHECK-NEXT:    flat_load_dwordx3 v[5:7], v[1:2] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
 ; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_short v17, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -6614,55 +3364,19 @@ define void @memcpy_p5_p0_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p0_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ushort v3, v[1:2] offset:30
-; CHECK-NEXT:    flat_load_ushort v4, v[1:2] offset:28
-; CHECK-NEXT:    flat_load_ushort v5, v[1:2] offset:26
-; CHECK-NEXT:    flat_load_ushort v6, v[1:2] offset:24
-; CHECK-NEXT:    flat_load_ushort v7, v[1:2] offset:22
-; CHECK-NEXT:    flat_load_ushort v8, v[1:2] offset:20
-; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:18
-; CHECK-NEXT:    flat_load_ushort v10, v[1:2] offset:16
-; CHECK-NEXT:    flat_load_ushort v11, v[1:2] offset:14
-; CHECK-NEXT:    flat_load_ushort v12, v[1:2] offset:12
-; CHECK-NEXT:    flat_load_ushort v13, v[1:2] offset:10
-; CHECK-NEXT:    flat_load_ushort v14, v[1:2] offset:8
-; CHECK-NEXT:    flat_load_ushort v15, v[1:2] offset:6
-; CHECK-NEXT:    flat_load_ushort v16, v[1:2] offset:4
-; CHECK-NEXT:    flat_load_ushort v17, v[1:2] offset:2
-; CHECK-NEXT:    flat_load_ushort v1, v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(13)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(11)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
-; CHECK-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
-; CHECK-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[3:6], v[1:2] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[7:10], v[1:2]
 ; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_short v17, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 32, i1 false)
@@ -6689,61 +3403,19 @@ define void @memcpy_p5_p0_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p0_sz31_align_8_8:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x10
-; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:17
-; CHECK-NEXT:    flat_load_ubyte v6, v[1:2] offset:18
-; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:15
-; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:16
-; CHECK-NEXT:    flat_load_ubyte v9, v[1:2] offset:21
-; CHECK-NEXT:    flat_load_ubyte v10, v[1:2] offset:22
-; CHECK-NEXT:    flat_load_ubyte v11, v[1:2] offset:19
-; CHECK-NEXT:    flat_load_ubyte v12, v[1:2] offset:20
-; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:25
-; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:26
-; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:23
-; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:24
-; CHECK-NEXT:    flat_load_ubyte v17, v[1:2] offset:29
-; CHECK-NEXT:    flat_load_ubyte v18, v[1:2] offset:30
-; CHECK-NEXT:    flat_load_ubyte v19, v[1:2] offset:27
-; CHECK-NEXT:    flat_load_ubyte v20, v[1:2] offset:28
-; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(16) lgkmcnt(16)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[3:6], v[1:2] offset:15
+; CHECK-NEXT:    flat_load_dwordx4 v[7:10], v[1:2]
 ; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:15
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 31, i1 false)
@@ -6793,61 +3465,19 @@ define void @memcpy_p5_p0_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr a
 ; CHECK-LABEL: memcpy_p5_p0_sz31_align_16_16:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x10
-; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:17
-; CHECK-NEXT:    flat_load_ubyte v6, v[1:2] offset:18
-; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:15
-; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:16
-; CHECK-NEXT:    flat_load_ubyte v9, v[1:2] offset:21
-; CHECK-NEXT:    flat_load_ubyte v10, v[1:2] offset:22
-; CHECK-NEXT:    flat_load_ubyte v11, v[1:2] offset:19
-; CHECK-NEXT:    flat_load_ubyte v12, v[1:2] offset:20
-; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:25
-; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:26
-; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:23
-; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:24
-; CHECK-NEXT:    flat_load_ubyte v17, v[1:2] offset:29
-; CHECK-NEXT:    flat_load_ubyte v18, v[1:2] offset:30
-; CHECK-NEXT:    flat_load_ubyte v19, v[1:2] offset:27
-; CHECK-NEXT:    flat_load_ubyte v20, v[1:2] offset:28
-; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(16) lgkmcnt(16)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[3:6], v[1:2] offset:15
+; CHECK-NEXT:    flat_load_dwordx4 v[7:10], v[1:2]
 ; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:15
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 31, i1 false)
@@ -6881,55 +3511,12 @@ define void @memcpy_p5_p1_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p1_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ubyte v3, v[1:2], off offset:15
-; CHECK-NEXT:    global_load_ubyte v4, v[1:2], off offset:14
-; CHECK-NEXT:    global_load_ubyte v5, v[1:2], off offset:13
-; CHECK-NEXT:    global_load_ubyte v6, v[1:2], off offset:12
-; CHECK-NEXT:    global_load_ubyte v7, v[1:2], off offset:11
-; CHECK-NEXT:    global_load_ubyte v8, v[1:2], off offset:10
-; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:9
-; CHECK-NEXT:    global_load_ubyte v10, v[1:2], off offset:8
-; CHECK-NEXT:    global_load_ubyte v11, v[1:2], off offset:7
-; CHECK-NEXT:    global_load_ubyte v12, v[1:2], off offset:6
-; CHECK-NEXT:    global_load_ubyte v13, v[1:2], off offset:5
-; CHECK-NEXT:    global_load_ubyte v14, v[1:2], off offset:4
-; CHECK-NEXT:    global_load_ubyte v15, v[1:2], off offset:3
-; CHECK-NEXT:    global_load_ubyte v16, v[1:2], off offset:2
-; CHECK-NEXT:    global_load_ubyte v17, v[1:2], off offset:1
-; CHECK-NEXT:    global_load_ubyte v1, v[1:2], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 16, i1 false)
@@ -6940,207 +3527,47 @@ define void @memcpy_p5_p1_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p1_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1e
-; CHECK-NEXT:    global_load_ubyte v3, v[1:2], off
-; CHECK-NEXT:    global_load_ubyte v4, v[1:2], off offset:1
-; CHECK-NEXT:    global_load_ubyte v5, v[1:2], off offset:2
-; CHECK-NEXT:    global_load_ubyte v6, v[1:2], off offset:3
-; CHECK-NEXT:    global_load_ubyte v7, v[1:2], off offset:4
-; CHECK-NEXT:    global_load_ubyte v8, v[1:2], off offset:5
-; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:6
-; CHECK-NEXT:    global_load_ubyte v10, v[1:2], off offset:7
-; CHECK-NEXT:    global_load_ubyte v11, v[1:2], off offset:8
-; CHECK-NEXT:    global_load_ubyte v12, v[1:2], off offset:9
-; CHECK-NEXT:    global_load_ubyte v13, v[1:2], off offset:10
-; CHECK-NEXT:    global_load_ubyte v14, v[1:2], off offset:11
-; CHECK-NEXT:    global_load_ubyte v15, v[1:2], off offset:12
-; CHECK-NEXT:    global_load_ubyte v16, v[1:2], off offset:13
-; CHECK-NEXT:    global_load_ubyte v17, v[1:2], off offset:14
-; CHECK-NEXT:    global_load_ubyte v18, v[1:2], off offset:30
-; CHECK-NEXT:    global_load_ubyte v19, v[1:2], off offset:29
-; CHECK-NEXT:    global_load_ubyte v20, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ubyte v21, v[1:2], off offset:27
-; CHECK-NEXT:    global_load_ubyte v22, v[1:2], off offset:26
-; CHECK-NEXT:    global_load_ubyte v23, v[1:2], off offset:25
-; CHECK-NEXT:    global_load_ubyte v24, v[1:2], off offset:24
-; CHECK-NEXT:    global_load_ubyte v25, v[1:2], off offset:23
-; CHECK-NEXT:    global_load_ubyte v26, v[1:2], off offset:22
-; CHECK-NEXT:    global_load_ubyte v27, v[1:2], off offset:21
-; CHECK-NEXT:    global_load_ubyte v28, v[1:2], off offset:20
-; CHECK-NEXT:    global_load_ubyte v29, v[1:2], off offset:19
-; CHECK-NEXT:    global_load_ubyte v30, v[1:2], off offset:18
-; CHECK-NEXT:    global_load_ubyte v31, v[1:2], off offset:17
-; CHECK-NEXT:    global_load_ubyte v32, v[1:2], off offset:16
-; CHECK-NEXT:    global_load_ubyte v1, v[1:2], off offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(29)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(21)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v21, v0, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v22, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v23, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v24, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v25, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v26, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v27, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v28, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v29, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    global_load_dwordx3 v[5:7], v[1:2], off offset:16
+; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:28
+; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:30
+; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v30, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v31, v0, s[0:3], 0 offen offset:17
+; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v32, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:15
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 31, i1 false)
   ret void
-}
-
-define void @memcpy_p5_p1_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(1) align 1 readonly %src) {
-; CHECK-LABEL: memcpy_p5_p1_sz32_align_1_1:
-; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1f
-; CHECK-NEXT:    global_load_ubyte v3, v[1:2], off offset:15
-; CHECK-NEXT:    global_load_ubyte v4, v[1:2], off offset:14
-; CHECK-NEXT:    global_load_ubyte v5, v[1:2], off offset:13
-; CHECK-NEXT:    global_load_ubyte v6, v[1:2], off offset:12
-; CHECK-NEXT:    global_load_ubyte v7, v[1:2], off offset:11
-; CHECK-NEXT:    global_load_ubyte v8, v[1:2], off offset:10
-; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:9
-; CHECK-NEXT:    global_load_ubyte v10, v[1:2], off offset:8
-; CHECK-NEXT:    global_load_ubyte v11, v[1:2], off offset:7
-; CHECK-NEXT:    global_load_ubyte v12, v[1:2], off offset:6
-; CHECK-NEXT:    global_load_ubyte v13, v[1:2], off offset:5
-; CHECK-NEXT:    global_load_ubyte v14, v[1:2], off offset:4
-; CHECK-NEXT:    global_load_ubyte v15, v[1:2], off offset:3
-; CHECK-NEXT:    global_load_ubyte v16, v[1:2], off offset:2
-; CHECK-NEXT:    global_load_ubyte v17, v[1:2], off offset:1
-; CHECK-NEXT:    global_load_ubyte v18, v[1:2], off
-; CHECK-NEXT:    global_load_ubyte v19, v[1:2], off offset:31
-; CHECK-NEXT:    global_load_ubyte v20, v[1:2], off offset:30
-; CHECK-NEXT:    global_load_ubyte v21, v[1:2], off offset:29
-; CHECK-NEXT:    global_load_ubyte v22, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ubyte v23, v[1:2], off offset:27
-; CHECK-NEXT:    global_load_ubyte v24, v[1:2], off offset:26
-; CHECK-NEXT:    global_load_ubyte v25, v[1:2], off offset:25
-; CHECK-NEXT:    global_load_ubyte v26, v[1:2], off offset:24
-; CHECK-NEXT:    global_load_ubyte v27, v[1:2], off offset:23
-; CHECK-NEXT:    global_load_ubyte v28, v[1:2], off offset:22
-; CHECK-NEXT:    global_load_ubyte v29, v[1:2], off offset:21
-; CHECK-NEXT:    global_load_ubyte v30, v[1:2], off offset:20
-; CHECK-NEXT:    global_load_ubyte v31, v[1:2], off offset:19
-; CHECK-NEXT:    global_load_ubyte v32, v[1:2], off offset:18
-; CHECK-NEXT:    global_load_ubyte v33, v[1:2], off offset:17
-; CHECK-NEXT:    global_load_ubyte v1, v[1:2], off offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(31)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(29)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(21)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:31
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v21, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v22, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v23, v0, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v24, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v25, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v26, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v27, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v28, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v29, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v30, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v31, v0, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v32, v0, s[0:3], 0 offen offset:18
+}
+
+define void @memcpy_p5_p1_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(1) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p1_sz32_align_1_1:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v33, v0, s[0:3], 0 offen offset:17
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 32, i1 false)
@@ -7151,31 +3578,12 @@ define void @memcpy_p5_p1_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p1_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    global_load_ushort v3, v[1:2], off
-; CHECK-NEXT:    global_load_ushort v4, v[1:2], off offset:2
-; CHECK-NEXT:    global_load_ushort v5, v[1:2], off offset:4
-; CHECK-NEXT:    global_load_ushort v6, v[1:2], off offset:6
-; CHECK-NEXT:    global_load_ushort v7, v[1:2], off offset:8
-; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:10
-; CHECK-NEXT:    global_load_ushort v9, v[1:2], off offset:12
-; CHECK-NEXT:    global_load_ushort v1, v[1:2], off offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 16, i1 false)
@@ -7186,55 +3594,24 @@ define void @memcpy_p5_p1_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p1_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ubyte v3, v[1:2], off offset:30
-; CHECK-NEXT:    global_load_ushort v4, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ushort v5, v[1:2], off offset:26
-; CHECK-NEXT:    global_load_ushort v6, v[1:2], off offset:24
-; CHECK-NEXT:    global_load_ushort v7, v[1:2], off offset:22
-; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:20
-; CHECK-NEXT:    global_load_ushort v9, v[1:2], off offset:18
-; CHECK-NEXT:    global_load_ushort v10, v[1:2], off offset:16
-; CHECK-NEXT:    global_load_ushort v11, v[1:2], off offset:14
-; CHECK-NEXT:    global_load_ushort v12, v[1:2], off offset:12
-; CHECK-NEXT:    global_load_ushort v13, v[1:2], off offset:10
-; CHECK-NEXT:    global_load_ushort v14, v[1:2], off offset:8
-; CHECK-NEXT:    global_load_ushort v15, v[1:2], off offset:6
-; CHECK-NEXT:    global_load_ushort v16, v[1:2], off offset:4
-; CHECK-NEXT:    global_load_ushort v17, v[1:2], off offset:2
-; CHECK-NEXT:    global_load_ushort v1, v[1:2], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    global_load_dwordx3 v[5:7], v[1:2], off offset:16
+; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:28
+; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:30
+; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_short v17, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -7245,55 +3622,19 @@ define void @memcpy_p5_p1_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p1_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ushort v3, v[1:2], off offset:30
-; CHECK-NEXT:    global_load_ushort v4, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ushort v5, v[1:2], off offset:26
-; CHECK-NEXT:    global_load_ushort v6, v[1:2], off offset:24
-; CHECK-NEXT:    global_load_ushort v7, v[1:2], off offset:22
-; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:20
-; CHECK-NEXT:    global_load_ushort v9, v[1:2], off offset:18
-; CHECK-NEXT:    global_load_ushort v10, v[1:2], off offset:16
-; CHECK-NEXT:    global_load_ushort v11, v[1:2], off offset:14
-; CHECK-NEXT:    global_load_ushort v12, v[1:2], off offset:12
-; CHECK-NEXT:    global_load_ushort v13, v[1:2], off offset:10
-; CHECK-NEXT:    global_load_ushort v14, v[1:2], off offset:8
-; CHECK-NEXT:    global_load_ushort v15, v[1:2], off offset:6
-; CHECK-NEXT:    global_load_ushort v16, v[1:2], off offset:4
-; CHECK-NEXT:    global_load_ushort v17, v[1:2], off offset:2
-; CHECK-NEXT:    global_load_ushort v1, v[1:2], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_short v17, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 32, i1 false)
@@ -7329,30 +3670,10 @@ define void @memcpy_p5_p1_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr addr
 ; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_store_byte_d16_hi v10, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    v_lshrrev_b32_e32 v1, 24, v7
-; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 8, v7
-; CHECK-NEXT:    v_lshrrev_b32_e32 v3, 24, v8
-; CHECK-NEXT:    v_lshrrev_b32_e32 v4, 8, v8
-; CHECK-NEXT:    v_lshrrev_b32_e32 v5, 24, v9
-; CHECK-NEXT:    v_lshrrev_b32_e32 v6, 8, v9
-; CHECK-NEXT:    v_lshrrev_b32_e32 v7, 24, v10
-; CHECK-NEXT:    v_lshrrev_b32_e32 v8, 8, v10
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:15
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(1) noundef nonnull align 8 %src, i64 31, i1 false)
@@ -7411,30 +3732,10 @@ define void @memcpy_p5_p1_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr a
 ; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_store_byte_d16_hi v10, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    v_lshrrev_b32_e32 v1, 24, v7
-; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 8, v7
-; CHECK-NEXT:    v_lshrrev_b32_e32 v3, 24, v8
-; CHECK-NEXT:    v_lshrrev_b32_e32 v4, 8, v8
-; CHECK-NEXT:    v_lshrrev_b32_e32 v5, 24, v9
-; CHECK-NEXT:    v_lshrrev_b32_e32 v6, 8, v9
-; CHECK-NEXT:    v_lshrrev_b32_e32 v7, 24, v10
-; CHECK-NEXT:    v_lshrrev_b32_e32 v8, 8, v10
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:15
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i64 31, i1 false)
@@ -7468,54 +3769,12 @@ define void @memcpy_p5_p3_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p3_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u8 v2, v1 offset:15
-; CHECK-NEXT:    ds_read_u8 v3, v1 offset:14
-; CHECK-NEXT:    ds_read_u8 v4, v1 offset:13
-; CHECK-NEXT:    ds_read_u8 v5, v1 offset:12
-; CHECK-NEXT:    ds_read_u8 v6, v1 offset:11
-; CHECK-NEXT:    ds_read_u8 v7, v1 offset:10
-; CHECK-NEXT:    ds_read_u8 v8, v1 offset:9
-; CHECK-NEXT:    ds_read_u8 v9, v1 offset:8
-; CHECK-NEXT:    ds_read_u8 v10, v1 offset:7
-; CHECK-NEXT:    ds_read_u8 v11, v1 offset:6
-; CHECK-NEXT:    ds_read_u8 v12, v1 offset:5
-; CHECK-NEXT:    ds_read_u8 v13, v1 offset:4
-; CHECK-NEXT:    ds_read_u8 v14, v1 offset:3
-; CHECK-NEXT:    ds_read_u8 v15, v1 offset:2
-; CHECK-NEXT:    ds_read_u8 v16, v1 offset:1
-; CHECK-NEXT:    ds_read_u8 v1, v1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt lgkmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt lgkmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt lgkmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt lgkmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt lgkmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    s_waitcnt lgkmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt lgkmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT:    ds_read2_b64 v[1:4], v1 offset1:1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 16, i1 false)
@@ -7526,85 +3785,25 @@ define void @memcpy_p5_p3_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p3_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u8 v2, v1
-; CHECK-NEXT:    ds_read_u8 v3, v1 offset:1
-; CHECK-NEXT:    ds_read_u8 v4, v1 offset:2
-; CHECK-NEXT:    ds_read_u8 v5, v1 offset:3
-; CHECK-NEXT:    ds_read_u8 v6, v1 offset:4
-; CHECK-NEXT:    ds_read_u8 v7, v1 offset:5
-; CHECK-NEXT:    ds_read_u8 v8, v1 offset:6
-; CHECK-NEXT:    ds_read_u8 v9, v1 offset:7
-; CHECK-NEXT:    ds_read_u8 v10, v1 offset:8
-; CHECK-NEXT:    ds_read_u8 v11, v1 offset:9
-; CHECK-NEXT:    ds_read_u8 v12, v1 offset:10
-; CHECK-NEXT:    ds_read_u8 v13, v1 offset:11
-; CHECK-NEXT:    ds_read_u8 v14, v1 offset:12
-; CHECK-NEXT:    ds_read_u8 v15, v1 offset:13
-; CHECK-NEXT:    ds_read_u8 v16, v1 offset:14
-; CHECK-NEXT:    ds_read_u8 v17, v1 offset:15
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt lgkmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    ds_read_u8 v2, v1 offset:24
-; CHECK-NEXT:    ds_read_u8 v3, v1 offset:25
-; CHECK-NEXT:    ds_read_u8 v4, v1 offset:26
-; CHECK-NEXT:    ds_read_u8 v18, v1 offset:27
-; CHECK-NEXT:    ds_read_u8 v19, v1 offset:28
-; CHECK-NEXT:    ds_read_u8 v20, v1 offset:29
-; CHECK-NEXT:    ds_read_u8 v21, v1 offset:30
-; CHECK-NEXT:    ds_read_u8 v22, v1 offset:16
-; CHECK-NEXT:    ds_read_u8 v23, v1 offset:17
-; CHECK-NEXT:    ds_read_u8 v24, v1 offset:18
-; CHECK-NEXT:    ds_read_u8 v25, v1 offset:19
-; CHECK-NEXT:    ds_read_u8 v26, v1 offset:20
-; CHECK-NEXT:    ds_read_u8 v27, v1 offset:21
-; CHECK-NEXT:    ds_read_u8 v28, v1 offset:22
-; CHECK-NEXT:    ds_read_u8 v1, v1 offset:23
-; CHECK-NEXT:    s_waitcnt lgkmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    s_waitcnt lgkmcnt(26)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt lgkmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    s_waitcnt lgkmcnt(24)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt lgkmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    s_waitcnt lgkmcnt(22)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt lgkmcnt(21)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt lgkmcnt(20)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt lgkmcnt(19)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt lgkmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt lgkmcnt(16)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt lgkmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v21, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_store_byte v28, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_store_byte v27, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_store_byte v26, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_store_byte v25, v0, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_store_byte v24, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_store_byte v23, v0, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_store_byte v22, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:15
+; CHECK-NEXT:    ds_read_b32 v8, v1 offset:24
+; CHECK-NEXT:    ds_read_u16 v9, v1 offset:28
+; CHECK-NEXT:    ds_read_u8 v10, v1 offset:30
+; CHECK-NEXT:    ds_read2_b64 v[2:5], v1 offset1:1
+; CHECK-NEXT:    ds_read_b64 v[6:7], v1 offset:16
+; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
+; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
+; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -7615,79 +3814,18 @@ define void @memcpy_p5_p3_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p3_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u8 v2, v1 offset:15
-; CHECK-NEXT:    ds_read_u8 v3, v1 offset:14
-; CHECK-NEXT:    ds_read_u8 v4, v1 offset:13
-; CHECK-NEXT:    ds_read_u8 v5, v1 offset:12
-; CHECK-NEXT:    ds_read_u8 v6, v1 offset:11
-; CHECK-NEXT:    ds_read_u8 v7, v1 offset:8
-; CHECK-NEXT:    ds_read_u8 v8, v1 offset:9
-; CHECK-NEXT:    ds_read_u8 v9, v1 offset:10
-; CHECK-NEXT:    ds_read_u8 v10, v1
-; CHECK-NEXT:    ds_read_u8 v11, v1 offset:1
-; CHECK-NEXT:    ds_read_u8 v12, v1 offset:2
-; CHECK-NEXT:    ds_read_u8 v13, v1 offset:3
-; CHECK-NEXT:    ds_read_u8 v14, v1 offset:4
-; CHECK-NEXT:    ds_read_u8 v15, v1 offset:5
-; CHECK-NEXT:    ds_read_u8 v16, v1 offset:6
-; CHECK-NEXT:    ds_read_u8 v17, v1 offset:7
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt lgkmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt lgkmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    ds_read_u8 v2, v1 offset:24
-; CHECK-NEXT:    ds_read_u8 v3, v1 offset:25
-; CHECK-NEXT:    ds_read_u8 v4, v1 offset:26
-; CHECK-NEXT:    ds_read_u8 v5, v1 offset:27
-; CHECK-NEXT:    ds_read_u8 v6, v1 offset:28
-; CHECK-NEXT:    ds_read_u8 v18, v1 offset:29
-; CHECK-NEXT:    ds_read_u8 v19, v1 offset:30
-; CHECK-NEXT:    ds_read_u8 v20, v1 offset:31
-; CHECK-NEXT:    ds_read_u8 v21, v1 offset:16
-; CHECK-NEXT:    ds_read_u8 v22, v1 offset:17
-; CHECK-NEXT:    ds_read_u8 v23, v1 offset:18
-; CHECK-NEXT:    ds_read_u8 v24, v1 offset:19
-; CHECK-NEXT:    ds_read_u8 v25, v1 offset:20
-; CHECK-NEXT:    ds_read_u8 v26, v1 offset:21
-; CHECK-NEXT:    ds_read_u8 v27, v1 offset:22
-; CHECK-NEXT:    ds_read_u8 v1, v1 offset:23
-; CHECK-NEXT:    s_waitcnt lgkmcnt(24)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt lgkmcnt(16)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt lgkmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:31
-; CHECK-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_store_byte v27, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_store_byte v26, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_store_byte v25, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_store_byte v24, v0, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_store_byte v23, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_store_byte v22, v0, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_store_byte v21, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    ds_read2_b64 v[2:5], v1 offset1:1
+; CHECK-NEXT:    ds_read2_b64 v[6:9], v1 offset0:2 offset1:3
+; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 32, i1 false)
@@ -7698,30 +3836,12 @@ define void @memcpy_p5_p3_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p3_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u16 v2, v1
-; CHECK-NEXT:    ds_read_u16 v3, v1 offset:2
-; CHECK-NEXT:    ds_read_u16 v4, v1 offset:4
-; CHECK-NEXT:    ds_read_u16 v5, v1 offset:6
-; CHECK-NEXT:    ds_read_u16 v6, v1 offset:8
-; CHECK-NEXT:    ds_read_u16 v7, v1 offset:10
-; CHECK-NEXT:    ds_read_u16 v8, v1 offset:12
-; CHECK-NEXT:    ds_read_u16 v1, v1 offset:14
-; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
-; CHECK-NEXT:    buffer_store_short v2, v0, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt lgkmcnt(6)
-; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(5)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    ds_read2_b64 v[1:4], v1 offset1:1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 16, i1 false)
@@ -7732,54 +3852,25 @@ define void @memcpy_p5_p3_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p3_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u8 v2, v1 offset:30
-; CHECK-NEXT:    ds_read_u16 v3, v1 offset:28
-; CHECK-NEXT:    ds_read_u16 v4, v1 offset:26
-; CHECK-NEXT:    ds_read_u16 v5, v1 offset:24
-; CHECK-NEXT:    ds_read_u16 v6, v1 offset:22
-; CHECK-NEXT:    ds_read_u16 v7, v1 offset:20
-; CHECK-NEXT:    ds_read_u16 v8, v1 offset:18
-; CHECK-NEXT:    ds_read_u16 v9, v1 offset:16
-; CHECK-NEXT:    ds_read_u16 v10, v1 offset:14
-; CHECK-NEXT:    ds_read_u16 v11, v1 offset:12
-; CHECK-NEXT:    ds_read_u16 v12, v1 offset:10
-; CHECK-NEXT:    ds_read_u16 v13, v1 offset:8
-; CHECK-NEXT:    ds_read_u16 v14, v1 offset:6
-; CHECK-NEXT:    ds_read_u16 v15, v1 offset:4
-; CHECK-NEXT:    ds_read_u16 v16, v1 offset:2
-; CHECK-NEXT:    ds_read_u16 v1, v1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt lgkmcnt(14)
-; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt lgkmcnt(11)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt lgkmcnt(10)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt lgkmcnt(9)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt lgkmcnt(8)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
-; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt lgkmcnt(6)
-; CHECK-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt lgkmcnt(5)
-; CHECK-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT:    ds_read_b32 v8, v1 offset:24
+; CHECK-NEXT:    ds_read_u16 v9, v1 offset:28
+; CHECK-NEXT:    ds_read_u8 v10, v1 offset:30
+; CHECK-NEXT:    ds_read2_b64 v[2:5], v1 offset1:1
+; CHECK-NEXT:    ds_read_b64 v[6:7], v1 offset:16
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -7790,54 +3881,18 @@ define void @memcpy_p5_p3_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p3_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u16 v2, v1 offset:30
-; CHECK-NEXT:    ds_read_u16 v3, v1 offset:28
-; CHECK-NEXT:    ds_read_u16 v4, v1 offset:26
-; CHECK-NEXT:    ds_read_u16 v5, v1 offset:24
-; CHECK-NEXT:    ds_read_u16 v6, v1 offset:22
-; CHECK-NEXT:    ds_read_u16 v7, v1 offset:20
-; CHECK-NEXT:    ds_read_u16 v8, v1 offset:18
-; CHECK-NEXT:    ds_read_u16 v9, v1 offset:16
-; CHECK-NEXT:    ds_read_u16 v10, v1 offset:14
-; CHECK-NEXT:    ds_read_u16 v11, v1 offset:12
-; CHECK-NEXT:    ds_read_u16 v12, v1 offset:10
-; CHECK-NEXT:    ds_read_u16 v13, v1 offset:8
-; CHECK-NEXT:    ds_read_u16 v14, v1 offset:6
-; CHECK-NEXT:    ds_read_u16 v15, v1 offset:4
-; CHECK-NEXT:    ds_read_u16 v16, v1 offset:2
-; CHECK-NEXT:    ds_read_u16 v1, v1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    buffer_store_short v2, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt lgkmcnt(14)
-; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt lgkmcnt(11)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt lgkmcnt(10)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt lgkmcnt(9)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt lgkmcnt(8)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
-; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt lgkmcnt(6)
-; CHECK-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt lgkmcnt(5)
-; CHECK-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    ds_read2_b64 v[2:5], v1 offset1:1
+; CHECK-NEXT:    ds_read2_b64 v[6:9], v1 offset0:2 offset1:3
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 32, i1 false)
@@ -7872,30 +3927,10 @@ define void @memcpy_p5_p3_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr addr
 ; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_byte_d16_hi v6, v0, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    v_lshrrev_b32_e32 v1, 24, v6
-; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 8, v6
-; CHECK-NEXT:    v_lshrrev_b32_e32 v3, 24, v7
-; CHECK-NEXT:    v_lshrrev_b32_e32 v4, 8, v7
-; CHECK-NEXT:    v_lshrrev_b32_e32 v5, 24, v8
-; CHECK-NEXT:    v_lshrrev_b32_e32 v6, 8, v8
-; CHECK-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
-; CHECK-NEXT:    v_lshrrev_b32_e32 v8, 8, v9
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:15
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(3) noundef nonnull align 8 %src, i64 31, i1 false)
@@ -7952,30 +3987,10 @@ define void @memcpy_p5_p3_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr a
 ; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_byte_d16_hi v6, v0, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    v_lshrrev_b32_e32 v1, 24, v6
-; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 8, v6
-; CHECK-NEXT:    v_lshrrev_b32_e32 v3, 24, v7
-; CHECK-NEXT:    v_lshrrev_b32_e32 v4, 8, v7
-; CHECK-NEXT:    v_lshrrev_b32_e32 v5, 24, v8
-; CHECK-NEXT:    v_lshrrev_b32_e32 v6, 8, v8
-; CHECK-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
-; CHECK-NEXT:    v_lshrrev_b32_e32 v8, 8, v9
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:15
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(3) noundef nonnull align 16 %src, i64 31, i1 false)
@@ -8008,55 +4023,12 @@ define void @memcpy_p5_p4_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p4_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ubyte v3, v[1:2], off offset:15
-; CHECK-NEXT:    global_load_ubyte v4, v[1:2], off offset:14
-; CHECK-NEXT:    global_load_ubyte v5, v[1:2], off offset:13
-; CHECK-NEXT:    global_load_ubyte v6, v[1:2], off offset:12
-; CHECK-NEXT:    global_load_ubyte v7, v[1:2], off offset:11
-; CHECK-NEXT:    global_load_ubyte v8, v[1:2], off offset:10
-; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:9
-; CHECK-NEXT:    global_load_ubyte v10, v[1:2], off offset:8
-; CHECK-NEXT:    global_load_ubyte v11, v[1:2], off offset:7
-; CHECK-NEXT:    global_load_ubyte v12, v[1:2], off offset:6
-; CHECK-NEXT:    global_load_ubyte v13, v[1:2], off offset:5
-; CHECK-NEXT:    global_load_ubyte v14, v[1:2], off offset:4
-; CHECK-NEXT:    global_load_ubyte v15, v[1:2], off offset:3
-; CHECK-NEXT:    global_load_ubyte v16, v[1:2], off offset:2
-; CHECK-NEXT:    global_load_ubyte v17, v[1:2], off offset:1
-; CHECK-NEXT:    global_load_ubyte v1, v[1:2], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 16, i1 false)
@@ -8067,100 +4039,24 @@ define void @memcpy_p5_p4_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p4_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1e
-; CHECK-NEXT:    global_load_ubyte v3, v[1:2], off
-; CHECK-NEXT:    global_load_ubyte v4, v[1:2], off offset:1
-; CHECK-NEXT:    global_load_ubyte v5, v[1:2], off offset:2
-; CHECK-NEXT:    global_load_ubyte v6, v[1:2], off offset:3
-; CHECK-NEXT:    global_load_ubyte v7, v[1:2], off offset:4
-; CHECK-NEXT:    global_load_ubyte v8, v[1:2], off offset:5
-; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:6
-; CHECK-NEXT:    global_load_ubyte v10, v[1:2], off offset:7
-; CHECK-NEXT:    global_load_ubyte v11, v[1:2], off offset:8
-; CHECK-NEXT:    global_load_ubyte v12, v[1:2], off offset:9
-; CHECK-NEXT:    global_load_ubyte v13, v[1:2], off offset:10
-; CHECK-NEXT:    global_load_ubyte v14, v[1:2], off offset:11
-; CHECK-NEXT:    global_load_ubyte v15, v[1:2], off offset:12
-; CHECK-NEXT:    global_load_ubyte v16, v[1:2], off offset:13
-; CHECK-NEXT:    global_load_ubyte v17, v[1:2], off offset:14
-; CHECK-NEXT:    global_load_ubyte v18, v[1:2], off offset:30
-; CHECK-NEXT:    global_load_ubyte v19, v[1:2], off offset:29
-; CHECK-NEXT:    global_load_ubyte v20, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ubyte v21, v[1:2], off offset:27
-; CHECK-NEXT:    global_load_ubyte v22, v[1:2], off offset:26
-; CHECK-NEXT:    global_load_ubyte v23, v[1:2], off offset:25
-; CHECK-NEXT:    global_load_ubyte v24, v[1:2], off offset:24
-; CHECK-NEXT:    global_load_ubyte v25, v[1:2], off offset:23
-; CHECK-NEXT:    global_load_ubyte v26, v[1:2], off offset:22
-; CHECK-NEXT:    global_load_ubyte v27, v[1:2], off offset:21
-; CHECK-NEXT:    global_load_ubyte v28, v[1:2], off offset:20
-; CHECK-NEXT:    global_load_ubyte v29, v[1:2], off offset:19
-; CHECK-NEXT:    global_load_ubyte v30, v[1:2], off offset:18
-; CHECK-NEXT:    global_load_ubyte v31, v[1:2], off offset:17
-; CHECK-NEXT:    global_load_ubyte v32, v[1:2], off offset:16
-; CHECK-NEXT:    global_load_ubyte v1, v[1:2], off offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(29)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(21)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v21, v0, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v22, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v23, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v24, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v25, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v26, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v27, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v28, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v29, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    global_load_dwordx3 v[5:7], v[1:2], off offset:16
+; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:28
+; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:30
+; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v30, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v31, v0, s[0:3], 0 offen offset:17
+; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v32, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:15
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -8171,103 +4067,19 @@ define void @memcpy_p5_p4_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p4_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1f
-; CHECK-NEXT:    global_load_ubyte v3, v[1:2], off offset:15
-; CHECK-NEXT:    global_load_ubyte v4, v[1:2], off offset:14
-; CHECK-NEXT:    global_load_ubyte v5, v[1:2], off offset:13
-; CHECK-NEXT:    global_load_ubyte v6, v[1:2], off offset:12
-; CHECK-NEXT:    global_load_ubyte v7, v[1:2], off offset:11
-; CHECK-NEXT:    global_load_ubyte v8, v[1:2], off offset:10
-; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:9
-; CHECK-NEXT:    global_load_ubyte v10, v[1:2], off offset:8
-; CHECK-NEXT:    global_load_ubyte v11, v[1:2], off offset:7
-; CHECK-NEXT:    global_load_ubyte v12, v[1:2], off offset:6
-; CHECK-NEXT:    global_load_ubyte v13, v[1:2], off offset:5
-; CHECK-NEXT:    global_load_ubyte v14, v[1:2], off offset:4
-; CHECK-NEXT:    global_load_ubyte v15, v[1:2], off offset:3
-; CHECK-NEXT:    global_load_ubyte v16, v[1:2], off offset:2
-; CHECK-NEXT:    global_load_ubyte v17, v[1:2], off offset:1
-; CHECK-NEXT:    global_load_ubyte v18, v[1:2], off
-; CHECK-NEXT:    global_load_ubyte v19, v[1:2], off offset:31
-; CHECK-NEXT:    global_load_ubyte v20, v[1:2], off offset:30
-; CHECK-NEXT:    global_load_ubyte v21, v[1:2], off offset:29
-; CHECK-NEXT:    global_load_ubyte v22, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ubyte v23, v[1:2], off offset:27
-; CHECK-NEXT:    global_load_ubyte v24, v[1:2], off offset:26
-; CHECK-NEXT:    global_load_ubyte v25, v[1:2], off offset:25
-; CHECK-NEXT:    global_load_ubyte v26, v[1:2], off offset:24
-; CHECK-NEXT:    global_load_ubyte v27, v[1:2], off offset:23
-; CHECK-NEXT:    global_load_ubyte v28, v[1:2], off offset:22
-; CHECK-NEXT:    global_load_ubyte v29, v[1:2], off offset:21
-; CHECK-NEXT:    global_load_ubyte v30, v[1:2], off offset:20
-; CHECK-NEXT:    global_load_ubyte v31, v[1:2], off offset:19
-; CHECK-NEXT:    global_load_ubyte v32, v[1:2], off offset:18
-; CHECK-NEXT:    global_load_ubyte v33, v[1:2], off offset:17
-; CHECK-NEXT:    global_load_ubyte v1, v[1:2], off offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(31)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(29)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(21)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:31
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v21, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v22, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v23, v0, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v24, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v25, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v26, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v27, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v28, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v29, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v30, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v31, v0, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v32, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v33, v0, s[0:3], 0 offen offset:17
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 32, i1 false)
@@ -8278,31 +4090,12 @@ define void @memcpy_p5_p4_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p4_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    global_load_ushort v3, v[1:2], off
-; CHECK-NEXT:    global_load_ushort v4, v[1:2], off offset:2
-; CHECK-NEXT:    global_load_ushort v5, v[1:2], off offset:4
-; CHECK-NEXT:    global_load_ushort v6, v[1:2], off offset:6
-; CHECK-NEXT:    global_load_ushort v7, v[1:2], off offset:8
-; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:10
-; CHECK-NEXT:    global_load_ushort v9, v[1:2], off offset:12
-; CHECK-NEXT:    global_load_ushort v1, v[1:2], off offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 16, i1 false)
@@ -8313,55 +4106,24 @@ define void @memcpy_p5_p4_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p4_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ubyte v3, v[1:2], off offset:30
-; CHECK-NEXT:    global_load_ushort v4, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ushort v5, v[1:2], off offset:26
-; CHECK-NEXT:    global_load_ushort v6, v[1:2], off offset:24
-; CHECK-NEXT:    global_load_ushort v7, v[1:2], off offset:22
-; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:20
-; CHECK-NEXT:    global_load_ushort v9, v[1:2], off offset:18
-; CHECK-NEXT:    global_load_ushort v10, v[1:2], off offset:16
-; CHECK-NEXT:    global_load_ushort v11, v[1:2], off offset:14
-; CHECK-NEXT:    global_load_ushort v12, v[1:2], off offset:12
-; CHECK-NEXT:    global_load_ushort v13, v[1:2], off offset:10
-; CHECK-NEXT:    global_load_ushort v14, v[1:2], off offset:8
-; CHECK-NEXT:    global_load_ushort v15, v[1:2], off offset:6
-; CHECK-NEXT:    global_load_ushort v16, v[1:2], off offset:4
-; CHECK-NEXT:    global_load_ushort v17, v[1:2], off offset:2
-; CHECK-NEXT:    global_load_ushort v1, v[1:2], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    global_load_dwordx3 v[5:7], v[1:2], off offset:16
+; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:28
+; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:30
+; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_short v17, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -8372,55 +4134,19 @@ define void @memcpy_p5_p4_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p4_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ushort v3, v[1:2], off offset:30
-; CHECK-NEXT:    global_load_ushort v4, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ushort v5, v[1:2], off offset:26
-; CHECK-NEXT:    global_load_ushort v6, v[1:2], off offset:24
-; CHECK-NEXT:    global_load_ushort v7, v[1:2], off offset:22
-; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:20
-; CHECK-NEXT:    global_load_ushort v9, v[1:2], off offset:18
-; CHECK-NEXT:    global_load_ushort v10, v[1:2], off offset:16
-; CHECK-NEXT:    global_load_ushort v11, v[1:2], off offset:14
-; CHECK-NEXT:    global_load_ushort v12, v[1:2], off offset:12
-; CHECK-NEXT:    global_load_ushort v13, v[1:2], off offset:10
-; CHECK-NEXT:    global_load_ushort v14, v[1:2], off offset:8
-; CHECK-NEXT:    global_load_ushort v15, v[1:2], off offset:6
-; CHECK-NEXT:    global_load_ushort v16, v[1:2], off offset:4
-; CHECK-NEXT:    global_load_ushort v17, v[1:2], off offset:2
-; CHECK-NEXT:    global_load_ushort v1, v[1:2], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_short v17, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 32, i1 false)
@@ -8456,30 +4182,10 @@ define void @memcpy_p5_p4_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr addr
 ; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_store_byte_d16_hi v10, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    v_lshrrev_b32_e32 v1, 24, v7
-; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 8, v7
-; CHECK-NEXT:    v_lshrrev_b32_e32 v3, 24, v8
-; CHECK-NEXT:    v_lshrrev_b32_e32 v4, 8, v8
-; CHECK-NEXT:    v_lshrrev_b32_e32 v5, 24, v9
-; CHECK-NEXT:    v_lshrrev_b32_e32 v6, 8, v9
-; CHECK-NEXT:    v_lshrrev_b32_e32 v7, 24, v10
-; CHECK-NEXT:    v_lshrrev_b32_e32 v8, 8, v10
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:15
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(4) noundef nonnull align 8 %src, i64 31, i1 false)
@@ -8538,30 +4244,10 @@ define void @memcpy_p5_p4_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr a
 ; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_store_byte_d16_hi v10, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    v_lshrrev_b32_e32 v1, 24, v7
-; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 8, v7
-; CHECK-NEXT:    v_lshrrev_b32_e32 v3, 24, v8
-; CHECK-NEXT:    v_lshrrev_b32_e32 v4, 8, v8
-; CHECK-NEXT:    v_lshrrev_b32_e32 v5, 24, v9
-; CHECK-NEXT:    v_lshrrev_b32_e32 v6, 8, v9
-; CHECK-NEXT:    v_lshrrev_b32_e32 v7, 24, v10
-; CHECK-NEXT:    v_lshrrev_b32_e32 v8, 8, v10
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:15
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(4) noundef nonnull align 16 %src, i64 31, i1 false)
@@ -8595,55 +4281,19 @@ define void @memcpy_p5_p5_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p5_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:3
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 16, i1 false)
@@ -8654,99 +4304,34 @@ define void @memcpy_p5_p5_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p5_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x11
-; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v16, v1, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen
-; CHECK-NEXT:    s_clause 0xc
-; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    s_clause 0x8
+; CHECK-NEXT:    buffer_load_ushort v2, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v9, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT:    buffer_store_short v2, v0, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:25
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:20
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:21
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:15
+; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -8757,103 +4342,31 @@ define void @memcpy_p5_p5_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p5_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x11
-; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v17, v1, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen
-; CHECK-NEXT:    s_clause 0xd
-; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:31
-; CHECK-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:31
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:25
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:20
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:21
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 32, i1 false)
@@ -8864,31 +4377,19 @@ define void @memcpy_p5_p5_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p5_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    buffer_load_ushort v2, v1, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ushort v3, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v4, v1, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ushort v5, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ushort v6, v1, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ushort v7, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ushort v8, v1, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ushort v1, v1, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_short v2, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 16, i1 false)
@@ -8899,55 +4400,34 @@ define void @memcpy_p5_p5_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p5_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ushort v3, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ushort v4, v1, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ushort v5, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ushort v6, v1, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ushort v7, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ushort v8, v1, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ushort v9, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ushort v10, v1, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ushort v11, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v12, v1, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ushort v13, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ushort v14, v1, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ushort v15, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ushort v16, v1, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ushort v1, v1, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT:    s_clause 0x8
+; CHECK-NEXT:    buffer_load_ushort v2, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v9, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_short v2, v0, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:20
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -8958,55 +4438,31 @@ define void @memcpy_p5_p5_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p5_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    buffer_load_ushort v2, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ushort v3, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ushort v4, v1, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ushort v5, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ushort v6, v1, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ushort v7, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ushort v8, v1, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ushort v9, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ushort v10, v1, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ushort v11, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v12, v1, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ushort v13, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ushort v14, v1, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ushort v15, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ushort v16, v1, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ushort v1, v1, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_short v2, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:20
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 32, i1 false)
@@ -9040,67 +4496,31 @@ define void @memcpy_p5_p5_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p5_sz31_align_8_8:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x13
-; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_dword v16, v1, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_dword v17, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_dword v18, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_dword v19, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:15
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:29
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:15
 ; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:19
 ; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_dword v16, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:23
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_dword v17, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:27
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(5) noundef nonnull align 8 %src, i64 31, i1 false)
@@ -9169,67 +4589,31 @@ define void @memcpy_p5_p5_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr a
 ; CHECK-LABEL: memcpy_p5_p5_sz31_align_16_16:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x13
-; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_dword v16, v1, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_dword v17, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_dword v18, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_dword v19, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:15
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:29
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:15
 ; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:19
 ; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_dword v16, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:23
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_dword v17, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:27
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(5) noundef nonnull align 16 %src, i64 31, i1 false)
diff --git a/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll b/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll
index cc5256620bfe08..4e5688adcd6bbd 100644
--- a/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll
+++ b/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll
@@ -13,55 +13,9 @@ define void @memmove_p0_p0_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p0_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:15
-; CHECK-NEXT:    flat_load_ubyte v5, v[2:3] offset:14
-; CHECK-NEXT:    flat_load_ubyte v6, v[2:3] offset:13
-; CHECK-NEXT:    flat_load_ubyte v7, v[2:3] offset:12
-; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:11
-; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:10
-; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:9
-; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:8
-; CHECK-NEXT:    flat_load_ubyte v12, v[2:3] offset:7
-; CHECK-NEXT:    flat_load_ubyte v13, v[2:3] offset:6
-; CHECK-NEXT:    flat_load_ubyte v14, v[2:3] offset:5
-; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:4
-; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:3
-; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:2
-; CHECK-NEXT:    flat_load_ubyte v18, v[2:3] offset:1
-; CHECK-NEXT:    flat_load_ubyte v2, v[2:3]
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:1
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2
+; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -73,100 +27,19 @@ define void @memmove_p0_p0_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p0_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1e
-; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:30
-; CHECK-NEXT:    flat_load_ubyte v5, v[2:3] offset:29
-; CHECK-NEXT:    flat_load_ubyte v6, v[2:3] offset:28
-; CHECK-NEXT:    flat_load_ubyte v7, v[2:3] offset:27
-; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:26
-; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:25
-; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:24
-; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:23
-; CHECK-NEXT:    flat_load_ubyte v12, v[2:3] offset:22
-; CHECK-NEXT:    flat_load_ubyte v13, v[2:3] offset:21
-; CHECK-NEXT:    flat_load_ubyte v14, v[2:3] offset:20
-; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:19
-; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:18
-; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:17
-; CHECK-NEXT:    flat_load_ubyte v18, v[2:3] offset:16
-; CHECK-NEXT:    flat_load_ubyte v19, v[2:3] offset:15
-; CHECK-NEXT:    flat_load_ubyte v20, v[2:3] offset:14
-; CHECK-NEXT:    flat_load_ubyte v21, v[2:3] offset:13
-; CHECK-NEXT:    flat_load_ubyte v22, v[2:3] offset:12
-; CHECK-NEXT:    flat_load_ubyte v23, v[2:3] offset:11
-; CHECK-NEXT:    flat_load_ubyte v24, v[2:3] offset:10
-; CHECK-NEXT:    flat_load_ubyte v25, v[2:3] offset:9
-; CHECK-NEXT:    flat_load_ubyte v26, v[2:3] offset:8
-; CHECK-NEXT:    flat_load_ubyte v27, v[2:3] offset:7
-; CHECK-NEXT:    flat_load_ubyte v28, v[2:3] offset:6
-; CHECK-NEXT:    flat_load_ubyte v29, v[2:3] offset:5
-; CHECK-NEXT:    flat_load_ubyte v30, v[2:3] offset:4
-; CHECK-NEXT:    flat_load_ubyte v31, v[2:3] offset:3
-; CHECK-NEXT:    flat_load_ubyte v32, v[2:3] offset:2
-; CHECK-NEXT:    flat_load_ubyte v33, v[2:3] offset:1
-; CHECK-NEXT:    flat_load_ubyte v2, v[2:3]
-; CHECK-NEXT:    s_waitcnt vmcnt(30) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(29) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(28) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(27) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(26) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(25) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(24) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(23) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(22) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(21) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(20) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(19) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(18) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(17) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(16) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v22 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v23 offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v24 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v25 offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v26 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v27 offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v28 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v29 offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v30 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v31 offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v32 offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v33 offset:1
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:30
+; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:28
+; CHECK-NEXT:    flat_load_dwordx3 v[6:8], v[2:3] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
+; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
+; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(3)
+; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(3)
+; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[6:8] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(3)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -178,103 +51,13 @@ define void @memmove_p0_p0_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p0_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1f
-; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:31
-; CHECK-NEXT:    flat_load_ubyte v5, v[2:3] offset:30
-; CHECK-NEXT:    flat_load_ubyte v6, v[2:3] offset:29
-; CHECK-NEXT:    flat_load_ubyte v7, v[2:3] offset:28
-; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:27
-; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:26
-; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:25
-; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:24
-; CHECK-NEXT:    flat_load_ubyte v12, v[2:3] offset:23
-; CHECK-NEXT:    flat_load_ubyte v13, v[2:3] offset:22
-; CHECK-NEXT:    flat_load_ubyte v14, v[2:3] offset:21
-; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:20
-; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:19
-; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:18
-; CHECK-NEXT:    flat_load_ubyte v18, v[2:3] offset:17
-; CHECK-NEXT:    flat_load_ubyte v19, v[2:3] offset:16
-; CHECK-NEXT:    flat_load_ubyte v20, v[2:3] offset:15
-; CHECK-NEXT:    flat_load_ubyte v21, v[2:3] offset:14
-; CHECK-NEXT:    flat_load_ubyte v22, v[2:3] offset:13
-; CHECK-NEXT:    flat_load_ubyte v23, v[2:3] offset:12
-; CHECK-NEXT:    flat_load_ubyte v24, v[2:3] offset:11
-; CHECK-NEXT:    flat_load_ubyte v25, v[2:3] offset:10
-; CHECK-NEXT:    flat_load_ubyte v26, v[2:3] offset:9
-; CHECK-NEXT:    flat_load_ubyte v27, v[2:3] offset:8
-; CHECK-NEXT:    flat_load_ubyte v28, v[2:3] offset:7
-; CHECK-NEXT:    flat_load_ubyte v29, v[2:3] offset:6
-; CHECK-NEXT:    flat_load_ubyte v30, v[2:3] offset:5
-; CHECK-NEXT:    flat_load_ubyte v31, v[2:3] offset:4
-; CHECK-NEXT:    flat_load_ubyte v32, v[2:3] offset:3
-; CHECK-NEXT:    flat_load_ubyte v33, v[2:3] offset:2
-; CHECK-NEXT:    flat_load_ubyte v34, v[2:3] offset:1
-; CHECK-NEXT:    flat_load_ubyte v2, v[2:3]
-; CHECK-NEXT:    s_waitcnt vmcnt(31) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:31
-; CHECK-NEXT:    s_waitcnt vmcnt(30) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(29) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(28) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(27) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(26) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(25) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(24) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(23) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(22) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(21) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(20) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(19) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(18) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(17) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(16) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v22 offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v23 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v24 offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v25 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v26 offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v27 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v28 offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v29 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v30 offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v31 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v32 offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v33 offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v34 offset:1
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[2:3] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[8:11], v[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -286,31 +69,9 @@ define void @memmove_p0_p0_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p0_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    flat_load_ushort v4, v[2:3] offset:14
-; CHECK-NEXT:    flat_load_ushort v5, v[2:3] offset:12
-; CHECK-NEXT:    flat_load_ushort v6, v[2:3] offset:10
-; CHECK-NEXT:    flat_load_ushort v7, v[2:3] offset:8
-; CHECK-NEXT:    flat_load_ushort v8, v[2:3] offset:6
-; CHECK-NEXT:    flat_load_ushort v9, v[2:3] offset:4
-; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:2
-; CHECK-NEXT:    flat_load_ushort v2, v[2:3]
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -322,55 +83,19 @@ define void @memmove_p0_p0_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p0_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:30
-; CHECK-NEXT:    flat_load_ushort v5, v[2:3] offset:28
-; CHECK-NEXT:    flat_load_ushort v6, v[2:3] offset:26
-; CHECK-NEXT:    flat_load_ushort v7, v[2:3] offset:24
-; CHECK-NEXT:    flat_load_ushort v8, v[2:3] offset:22
-; CHECK-NEXT:    flat_load_ushort v9, v[2:3] offset:20
-; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:18
-; CHECK-NEXT:    flat_load_ushort v11, v[2:3] offset:16
-; CHECK-NEXT:    flat_load_ushort v12, v[2:3] offset:14
-; CHECK-NEXT:    flat_load_ushort v13, v[2:3] offset:12
-; CHECK-NEXT:    flat_load_ushort v14, v[2:3] offset:10
-; CHECK-NEXT:    flat_load_ushort v15, v[2:3] offset:8
-; CHECK-NEXT:    flat_load_ushort v16, v[2:3] offset:6
-; CHECK-NEXT:    flat_load_ushort v17, v[2:3] offset:4
-; CHECK-NEXT:    flat_load_ushort v18, v[2:3] offset:2
-; CHECK-NEXT:    flat_load_ushort v2, v[2:3]
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v12 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v13 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v14 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v15 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v16 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v17 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v18 offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:30
+; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:28
+; CHECK-NEXT:    flat_load_dwordx3 v[6:8], v[2:3] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
+; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
+; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(3)
+; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(3)
+; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[6:8] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(3)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -382,55 +107,13 @@ define void @memmove_p0_p0_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p0_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ushort v4, v[2:3] offset:30
-; CHECK-NEXT:    flat_load_ushort v5, v[2:3] offset:28
-; CHECK-NEXT:    flat_load_ushort v6, v[2:3] offset:26
-; CHECK-NEXT:    flat_load_ushort v7, v[2:3] offset:24
-; CHECK-NEXT:    flat_load_ushort v8, v[2:3] offset:22
-; CHECK-NEXT:    flat_load_ushort v9, v[2:3] offset:20
-; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:18
-; CHECK-NEXT:    flat_load_ushort v11, v[2:3] offset:16
-; CHECK-NEXT:    flat_load_ushort v12, v[2:3] offset:14
-; CHECK-NEXT:    flat_load_ushort v13, v[2:3] offset:12
-; CHECK-NEXT:    flat_load_ushort v14, v[2:3] offset:10
-; CHECK-NEXT:    flat_load_ushort v15, v[2:3] offset:8
-; CHECK-NEXT:    flat_load_ushort v16, v[2:3] offset:6
-; CHECK-NEXT:    flat_load_ushort v17, v[2:3] offset:4
-; CHECK-NEXT:    flat_load_ushort v18, v[2:3] offset:2
-; CHECK-NEXT:    flat_load_ushort v2, v[2:3]
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v12 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v13 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v14 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v15 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v16 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v17 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v18 offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[2:3] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[8:11], v[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -559,55 +242,9 @@ define void @memmove_p0_p1_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p1_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:15
-; CHECK-NEXT:    global_load_ubyte v5, v[2:3], off offset:14
-; CHECK-NEXT:    global_load_ubyte v6, v[2:3], off offset:13
-; CHECK-NEXT:    global_load_ubyte v7, v[2:3], off offset:12
-; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:11
-; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:10
-; CHECK-NEXT:    global_load_ubyte v10, v[2:3], off offset:9
-; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:8
-; CHECK-NEXT:    global_load_ubyte v12, v[2:3], off offset:7
-; CHECK-NEXT:    global_load_ubyte v13, v[2:3], off offset:6
-; CHECK-NEXT:    global_load_ubyte v14, v[2:3], off offset:5
-; CHECK-NEXT:    global_load_ubyte v15, v[2:3], off offset:4
-; CHECK-NEXT:    global_load_ubyte v16, v[2:3], off offset:3
-; CHECK-NEXT:    global_load_ubyte v17, v[2:3], off offset:2
-; CHECK-NEXT:    global_load_ubyte v18, v[2:3], off offset:1
-; CHECK-NEXT:    global_load_ubyte v2, v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:1
+; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -619,100 +256,19 @@ define void @memmove_p0_p1_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p1_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1e
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:30
-; CHECK-NEXT:    global_load_ubyte v5, v[2:3], off offset:29
-; CHECK-NEXT:    global_load_ubyte v6, v[2:3], off offset:28
-; CHECK-NEXT:    global_load_ubyte v7, v[2:3], off offset:27
-; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:26
-; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:25
-; CHECK-NEXT:    global_load_ubyte v10, v[2:3], off offset:24
-; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:23
-; CHECK-NEXT:    global_load_ubyte v12, v[2:3], off offset:22
-; CHECK-NEXT:    global_load_ubyte v13, v[2:3], off offset:21
-; CHECK-NEXT:    global_load_ubyte v14, v[2:3], off offset:20
-; CHECK-NEXT:    global_load_ubyte v15, v[2:3], off offset:19
-; CHECK-NEXT:    global_load_ubyte v16, v[2:3], off offset:18
-; CHECK-NEXT:    global_load_ubyte v17, v[2:3], off offset:17
-; CHECK-NEXT:    global_load_ubyte v18, v[2:3], off offset:16
-; CHECK-NEXT:    global_load_ubyte v19, v[2:3], off offset:15
-; CHECK-NEXT:    global_load_ubyte v20, v[2:3], off offset:14
-; CHECK-NEXT:    global_load_ubyte v21, v[2:3], off offset:13
-; CHECK-NEXT:    global_load_ubyte v22, v[2:3], off offset:12
-; CHECK-NEXT:    global_load_ubyte v23, v[2:3], off offset:11
-; CHECK-NEXT:    global_load_ubyte v24, v[2:3], off offset:10
-; CHECK-NEXT:    global_load_ubyte v25, v[2:3], off offset:9
-; CHECK-NEXT:    global_load_ubyte v26, v[2:3], off offset:8
-; CHECK-NEXT:    global_load_ubyte v27, v[2:3], off offset:7
-; CHECK-NEXT:    global_load_ubyte v28, v[2:3], off offset:6
-; CHECK-NEXT:    global_load_ubyte v29, v[2:3], off offset:5
-; CHECK-NEXT:    global_load_ubyte v30, v[2:3], off offset:4
-; CHECK-NEXT:    global_load_ubyte v31, v[2:3], off offset:3
-; CHECK-NEXT:    global_load_ubyte v32, v[2:3], off offset:2
-; CHECK-NEXT:    global_load_ubyte v33, v[2:3], off offset:1
-; CHECK-NEXT:    global_load_ubyte v2, v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(29)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(21)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v22 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_byte v[0:1], v23 offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_byte v[0:1], v24 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_byte v[0:1], v25 offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_byte v[0:1], v26 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_byte v[0:1], v27 offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_byte v[0:1], v28 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_byte v[0:1], v29 offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v30 offset:4
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:30
+; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:28
+; CHECK-NEXT:    global_load_dwordx3 v[6:8], v[2:3], off offset:16
+; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v31 offset:3
+; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_byte v[0:1], v32 offset:2
+; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_byte v[0:1], v33 offset:1
+; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[6:8] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -724,103 +280,13 @@ define void @memmove_p0_p1_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p1_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1f
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:31
-; CHECK-NEXT:    global_load_ubyte v5, v[2:3], off offset:30
-; CHECK-NEXT:    global_load_ubyte v6, v[2:3], off offset:29
-; CHECK-NEXT:    global_load_ubyte v7, v[2:3], off offset:28
-; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:27
-; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:26
-; CHECK-NEXT:    global_load_ubyte v10, v[2:3], off offset:25
-; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:24
-; CHECK-NEXT:    global_load_ubyte v12, v[2:3], off offset:23
-; CHECK-NEXT:    global_load_ubyte v13, v[2:3], off offset:22
-; CHECK-NEXT:    global_load_ubyte v14, v[2:3], off offset:21
-; CHECK-NEXT:    global_load_ubyte v15, v[2:3], off offset:20
-; CHECK-NEXT:    global_load_ubyte v16, v[2:3], off offset:19
-; CHECK-NEXT:    global_load_ubyte v17, v[2:3], off offset:18
-; CHECK-NEXT:    global_load_ubyte v18, v[2:3], off offset:17
-; CHECK-NEXT:    global_load_ubyte v19, v[2:3], off offset:16
-; CHECK-NEXT:    global_load_ubyte v20, v[2:3], off offset:15
-; CHECK-NEXT:    global_load_ubyte v21, v[2:3], off offset:14
-; CHECK-NEXT:    global_load_ubyte v22, v[2:3], off offset:13
-; CHECK-NEXT:    global_load_ubyte v23, v[2:3], off offset:12
-; CHECK-NEXT:    global_load_ubyte v24, v[2:3], off offset:11
-; CHECK-NEXT:    global_load_ubyte v25, v[2:3], off offset:10
-; CHECK-NEXT:    global_load_ubyte v26, v[2:3], off offset:9
-; CHECK-NEXT:    global_load_ubyte v27, v[2:3], off offset:8
-; CHECK-NEXT:    global_load_ubyte v28, v[2:3], off offset:7
-; CHECK-NEXT:    global_load_ubyte v29, v[2:3], off offset:6
-; CHECK-NEXT:    global_load_ubyte v30, v[2:3], off offset:5
-; CHECK-NEXT:    global_load_ubyte v31, v[2:3], off offset:4
-; CHECK-NEXT:    global_load_ubyte v32, v[2:3], off offset:3
-; CHECK-NEXT:    global_load_ubyte v33, v[2:3], off offset:2
-; CHECK-NEXT:    global_load_ubyte v34, v[2:3], off offset:1
-; CHECK-NEXT:    global_load_ubyte v2, v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:31
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(29)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(21)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v22 offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v23 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_byte v[0:1], v24 offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_byte v[0:1], v25 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_byte v[0:1], v26 offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_byte v[0:1], v27 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_byte v[0:1], v28 offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_byte v[0:1], v29 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_byte v[0:1], v30 offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v31 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v32 offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_byte v[0:1], v33 offset:2
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:16
+; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_byte v[0:1], v34 offset:1
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -832,31 +298,9 @@ define void @memmove_p0_p1_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p1_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    global_load_ushort v4, v[2:3], off offset:14
-; CHECK-NEXT:    global_load_ushort v5, v[2:3], off offset:12
-; CHECK-NEXT:    global_load_ushort v6, v[2:3], off offset:10
-; CHECK-NEXT:    global_load_ushort v7, v[2:3], off offset:8
-; CHECK-NEXT:    global_load_ushort v8, v[2:3], off offset:6
-; CHECK-NEXT:    global_load_ushort v9, v[2:3], off offset:4
-; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:2
-; CHECK-NEXT:    global_load_ushort v2, v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:2
+; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -868,55 +312,19 @@ define void @memmove_p0_p1_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p1_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:30
-; CHECK-NEXT:    global_load_ushort v5, v[2:3], off offset:28
-; CHECK-NEXT:    global_load_ushort v6, v[2:3], off offset:26
-; CHECK-NEXT:    global_load_ushort v7, v[2:3], off offset:24
-; CHECK-NEXT:    global_load_ushort v8, v[2:3], off offset:22
-; CHECK-NEXT:    global_load_ushort v9, v[2:3], off offset:20
-; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:18
-; CHECK-NEXT:    global_load_ushort v11, v[2:3], off offset:16
-; CHECK-NEXT:    global_load_ushort v12, v[2:3], off offset:14
-; CHECK-NEXT:    global_load_ushort v13, v[2:3], off offset:12
-; CHECK-NEXT:    global_load_ushort v14, v[2:3], off offset:10
-; CHECK-NEXT:    global_load_ushort v15, v[2:3], off offset:8
-; CHECK-NEXT:    global_load_ushort v16, v[2:3], off offset:6
-; CHECK-NEXT:    global_load_ushort v17, v[2:3], off offset:4
-; CHECK-NEXT:    global_load_ushort v18, v[2:3], off offset:2
-; CHECK-NEXT:    global_load_ushort v2, v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v12 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_short v[0:1], v13 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_short v[0:1], v14 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_short v[0:1], v15 offset:8
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:30
+; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:28
+; CHECK-NEXT:    global_load_dwordx3 v[6:8], v[2:3], off offset:16
+; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_short v[0:1], v16 offset:6
+; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_short v[0:1], v17 offset:4
+; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_short v[0:1], v18 offset:2
+; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[6:8] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -928,55 +336,13 @@ define void @memmove_p0_p1_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p1_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ushort v4, v[2:3], off offset:30
-; CHECK-NEXT:    global_load_ushort v5, v[2:3], off offset:28
-; CHECK-NEXT:    global_load_ushort v6, v[2:3], off offset:26
-; CHECK-NEXT:    global_load_ushort v7, v[2:3], off offset:24
-; CHECK-NEXT:    global_load_ushort v8, v[2:3], off offset:22
-; CHECK-NEXT:    global_load_ushort v9, v[2:3], off offset:20
-; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:18
-; CHECK-NEXT:    global_load_ushort v11, v[2:3], off offset:16
-; CHECK-NEXT:    global_load_ushort v12, v[2:3], off offset:14
-; CHECK-NEXT:    global_load_ushort v13, v[2:3], off offset:12
-; CHECK-NEXT:    global_load_ushort v14, v[2:3], off offset:10
-; CHECK-NEXT:    global_load_ushort v15, v[2:3], off offset:8
-; CHECK-NEXT:    global_load_ushort v16, v[2:3], off offset:6
-; CHECK-NEXT:    global_load_ushort v17, v[2:3], off offset:4
-; CHECK-NEXT:    global_load_ushort v18, v[2:3], off offset:2
-; CHECK-NEXT:    global_load_ushort v2, v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v12 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_short v[0:1], v13 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_short v[0:1], v14 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_short v[0:1], v15 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_short v[0:1], v16 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_short v[0:1], v17 offset:4
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:16
+; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_short v[0:1], v18 offset:2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1105,54 +471,9 @@ define void @memmove_p0_p3_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p3_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:15
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:14
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:13
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:12
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:11
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:10
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:9
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:8
-; CHECK-NEXT:    ds_read_u8 v11, v2 offset:7
-; CHECK-NEXT:    ds_read_u8 v12, v2 offset:6
-; CHECK-NEXT:    ds_read_u8 v13, v2 offset:5
-; CHECK-NEXT:    ds_read_u8 v14, v2 offset:4
-; CHECK-NEXT:    ds_read_u8 v15, v2 offset:3
-; CHECK-NEXT:    ds_read_u8 v16, v2 offset:2
-; CHECK-NEXT:    ds_read_u8 v17, v2 offset:1
-; CHECK-NEXT:    ds_read_u8 v2, v2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:15
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:14
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:13
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:12
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:11
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:10
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:9
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:8
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:7
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:6
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:5
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:4
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:3
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2
+; CHECK-NEXT:    ds_read2_b64 v[2:5], v2 offset1:1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1164,72 +485,19 @@ define void @memmove_p0_p3_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p3_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:24
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:25
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:26
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:27
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:28
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:29
 ; CHECK-NEXT:    ds_read_u8 v9, v2 offset:30
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:16
-; CHECK-NEXT:    ds_read_u8 v11, v2 offset:17
-; CHECK-NEXT:    ds_read_u8 v12, v2 offset:18
-; CHECK-NEXT:    ds_read_u8 v13, v2 offset:19
-; CHECK-NEXT:    ds_read_u8 v14, v2 offset:20
-; CHECK-NEXT:    ds_read_u8 v15, v2 offset:21
-; CHECK-NEXT:    ds_read_u8 v16, v2 offset:22
-; CHECK-NEXT:    ds_read_u8 v17, v2 offset:23
-; CHECK-NEXT:    ds_read_u8 v18, v2 offset:8
-; CHECK-NEXT:    ds_read_u8 v19, v2 offset:9
-; CHECK-NEXT:    ds_read_u8 v20, v2 offset:10
-; CHECK-NEXT:    ds_read_u8 v21, v2 offset:11
-; CHECK-NEXT:    ds_read_u8 v22, v2 offset:12
-; CHECK-NEXT:    ds_read_u8 v23, v2 offset:13
-; CHECK-NEXT:    ds_read_u8 v24, v2 offset:14
-; CHECK-NEXT:    ds_read_u8 v25, v2 offset:15
-; CHECK-NEXT:    ds_read_u8 v26, v2
-; CHECK-NEXT:    ds_read_u8 v27, v2 offset:1
-; CHECK-NEXT:    ds_read_u8 v28, v2 offset:2
-; CHECK-NEXT:    ds_read_u8 v29, v2 offset:3
-; CHECK-NEXT:    ds_read_u8 v30, v2 offset:4
-; CHECK-NEXT:    ds_read_u8 v31, v2 offset:5
-; CHECK-NEXT:    ds_read_u8 v32, v2 offset:6
-; CHECK-NEXT:    ds_read_u8 v2, v2 offset:7
-; CHECK-NEXT:    s_waitcnt lgkmcnt(24)
+; CHECK-NEXT:    ds_read_b32 v8, v2 offset:24
+; CHECK-NEXT:    ds_read_u16 v10, v2 offset:28
+; CHECK-NEXT:    ds_read_b64 v[6:7], v2 offset:16
+; CHECK-NEXT:    ds_read2_b64 v[2:5], v2 offset1:1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
 ; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:29
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:28
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:27
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:26
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:25
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:24
-; CHECK-NEXT:    s_waitcnt lgkmcnt(23)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:23
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:22
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:21
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:20
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:19
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:18
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:17
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:16
-; CHECK-NEXT:    s_waitcnt lgkmcnt(23)
-; CHECK-NEXT:    flat_store_byte v[0:1], v25 offset:15
-; CHECK-NEXT:    flat_store_byte v[0:1], v24 offset:14
-; CHECK-NEXT:    flat_store_byte v[0:1], v23 offset:13
-; CHECK-NEXT:    flat_store_byte v[0:1], v22 offset:12
-; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:11
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:10
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:9
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:8
-; CHECK-NEXT:    s_waitcnt lgkmcnt(23)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:7
-; CHECK-NEXT:    flat_store_byte v[0:1], v32 offset:6
-; CHECK-NEXT:    flat_store_byte v[0:1], v31 offset:5
-; CHECK-NEXT:    flat_store_byte v[0:1], v30 offset:4
-; CHECK-NEXT:    flat_store_byte v[0:1], v29 offset:3
-; CHECK-NEXT:    flat_store_byte v[0:1], v28 offset:2
-; CHECK-NEXT:    flat_store_byte v[0:1], v27 offset:1
-; CHECK-NEXT:    flat_store_byte v[0:1], v26
+; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
+; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
+; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[6:8] offset:16
+; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1241,74 +509,12 @@ define void @memmove_p0_p3_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p3_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:24
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:25
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:26
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:27
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:28
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:29
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:30
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:31
-; CHECK-NEXT:    ds_read_u8 v11, v2 offset:16
-; CHECK-NEXT:    ds_read_u8 v12, v2 offset:17
-; CHECK-NEXT:    ds_read_u8 v13, v2 offset:18
-; CHECK-NEXT:    ds_read_u8 v14, v2 offset:19
-; CHECK-NEXT:    ds_read_u8 v15, v2 offset:20
-; CHECK-NEXT:    ds_read_u8 v16, v2 offset:21
-; CHECK-NEXT:    ds_read_u8 v17, v2 offset:22
-; CHECK-NEXT:    ds_read_u8 v18, v2 offset:23
-; CHECK-NEXT:    ds_read_u8 v19, v2 offset:8
-; CHECK-NEXT:    ds_read_u8 v20, v2 offset:9
-; CHECK-NEXT:    ds_read_u8 v21, v2 offset:10
-; CHECK-NEXT:    ds_read_u8 v22, v2 offset:11
-; CHECK-NEXT:    ds_read_u8 v23, v2 offset:12
-; CHECK-NEXT:    ds_read_u8 v24, v2 offset:13
-; CHECK-NEXT:    ds_read_u8 v25, v2 offset:14
-; CHECK-NEXT:    ds_read_u8 v26, v2 offset:15
-; CHECK-NEXT:    ds_read_u8 v27, v2
-; CHECK-NEXT:    ds_read_u8 v28, v2 offset:1
-; CHECK-NEXT:    ds_read_u8 v29, v2 offset:2
-; CHECK-NEXT:    ds_read_u8 v30, v2 offset:3
-; CHECK-NEXT:    ds_read_u8 v31, v2 offset:4
-; CHECK-NEXT:    ds_read_u8 v32, v2 offset:5
-; CHECK-NEXT:    ds_read_u8 v33, v2 offset:6
-; CHECK-NEXT:    ds_read_u8 v2, v2 offset:7
-; CHECK-NEXT:    s_waitcnt lgkmcnt(24)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:31
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:29
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:28
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:27
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:26
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:25
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:24
-; CHECK-NEXT:    s_waitcnt lgkmcnt(24)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:23
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:22
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:21
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:20
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:19
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:18
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:17
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:16
-; CHECK-NEXT:    s_waitcnt lgkmcnt(24)
-; CHECK-NEXT:    flat_store_byte v[0:1], v26 offset:15
-; CHECK-NEXT:    flat_store_byte v[0:1], v25 offset:14
-; CHECK-NEXT:    flat_store_byte v[0:1], v24 offset:13
-; CHECK-NEXT:    flat_store_byte v[0:1], v23 offset:12
-; CHECK-NEXT:    flat_store_byte v[0:1], v22 offset:11
-; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:10
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:9
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:8
-; CHECK-NEXT:    s_waitcnt lgkmcnt(24)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:7
-; CHECK-NEXT:    flat_store_byte v[0:1], v33 offset:6
-; CHECK-NEXT:    flat_store_byte v[0:1], v32 offset:5
-; CHECK-NEXT:    flat_store_byte v[0:1], v31 offset:4
-; CHECK-NEXT:    flat_store_byte v[0:1], v30 offset:3
-; CHECK-NEXT:    flat_store_byte v[0:1], v29 offset:2
-; CHECK-NEXT:    flat_store_byte v[0:1], v28 offset:1
-; CHECK-NEXT:    flat_store_byte v[0:1], v27
+; CHECK-NEXT:    ds_read2_b64 v[3:6], v2 offset0:2 offset1:3
+; CHECK-NEXT:    ds_read2_b64 v[7:10], v2 offset1:1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6] offset:16
+; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[7:10]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1320,30 +526,9 @@ define void @memmove_p0_p3_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p3_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u16 v3, v2 offset:14
-; CHECK-NEXT:    ds_read_u16 v4, v2 offset:12
-; CHECK-NEXT:    ds_read_u16 v5, v2 offset:10
-; CHECK-NEXT:    ds_read_u16 v6, v2 offset:8
-; CHECK-NEXT:    ds_read_u16 v7, v2 offset:6
-; CHECK-NEXT:    ds_read_u16 v8, v2 offset:4
-; CHECK-NEXT:    ds_read_u16 v9, v2 offset:2
-; CHECK-NEXT:    ds_read_u16 v2, v2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v3 offset:14
-; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:12
-; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:10
-; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:8
-; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:6
-; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:4
-; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    ds_read2_b64 v[2:5], v2 offset1:1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1355,54 +540,19 @@ define void @memmove_p0_p3_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p3_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:30
-; CHECK-NEXT:    ds_read_u16 v4, v2 offset:28
-; CHECK-NEXT:    ds_read_u16 v5, v2 offset:26
-; CHECK-NEXT:    ds_read_u16 v6, v2 offset:24
-; CHECK-NEXT:    ds_read_u16 v7, v2 offset:22
-; CHECK-NEXT:    ds_read_u16 v8, v2 offset:20
-; CHECK-NEXT:    ds_read_u16 v9, v2 offset:18
-; CHECK-NEXT:    ds_read_u16 v10, v2 offset:16
-; CHECK-NEXT:    ds_read_u16 v11, v2 offset:14
-; CHECK-NEXT:    ds_read_u16 v12, v2 offset:12
-; CHECK-NEXT:    ds_read_u16 v13, v2 offset:10
-; CHECK-NEXT:    ds_read_u16 v14, v2 offset:8
-; CHECK-NEXT:    ds_read_u16 v15, v2 offset:6
-; CHECK-NEXT:    ds_read_u16 v16, v2 offset:4
-; CHECK-NEXT:    ds_read_u16 v17, v2 offset:2
-; CHECK-NEXT:    ds_read_u16 v2, v2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:30
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:28
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:26
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:24
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:22
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:20
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:18
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:16
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:14
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v12 offset:12
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v13 offset:10
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v14 offset:8
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v15 offset:6
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v16 offset:4
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v17 offset:2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    ds_read_u8 v9, v2 offset:30
+; CHECK-NEXT:    ds_read_b32 v8, v2 offset:24
+; CHECK-NEXT:    ds_read_u16 v10, v2 offset:28
+; CHECK-NEXT:    ds_read_b64 v[6:7], v2 offset:16
+; CHECK-NEXT:    ds_read2_b64 v[2:5], v2 offset1:1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
+; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
+; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
+; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
+; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[6:8] offset:16
+; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1414,54 +564,12 @@ define void @memmove_p0_p3_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p3_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u16 v3, v2 offset:30
-; CHECK-NEXT:    ds_read_u16 v4, v2 offset:28
-; CHECK-NEXT:    ds_read_u16 v5, v2 offset:26
-; CHECK-NEXT:    ds_read_u16 v6, v2 offset:24
-; CHECK-NEXT:    ds_read_u16 v7, v2 offset:22
-; CHECK-NEXT:    ds_read_u16 v8, v2 offset:20
-; CHECK-NEXT:    ds_read_u16 v9, v2 offset:18
-; CHECK-NEXT:    ds_read_u16 v10, v2 offset:16
-; CHECK-NEXT:    ds_read_u16 v11, v2 offset:14
-; CHECK-NEXT:    ds_read_u16 v12, v2 offset:12
-; CHECK-NEXT:    ds_read_u16 v13, v2 offset:10
-; CHECK-NEXT:    ds_read_u16 v14, v2 offset:8
-; CHECK-NEXT:    ds_read_u16 v15, v2 offset:6
-; CHECK-NEXT:    ds_read_u16 v16, v2 offset:4
-; CHECK-NEXT:    ds_read_u16 v17, v2 offset:2
-; CHECK-NEXT:    ds_read_u16 v2, v2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v3 offset:30
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:28
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:26
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:24
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:22
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:20
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:18
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:16
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:14
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v12 offset:12
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v13 offset:10
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v14 offset:8
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v15 offset:6
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v16 offset:4
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v17 offset:2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    ds_read2_b64 v[3:6], v2 offset0:2 offset1:3
+; CHECK-NEXT:    ds_read2_b64 v[7:10], v2 offset1:1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6] offset:16
+; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[7:10]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1583,55 +691,9 @@ define void @memmove_p0_p4_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p4_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:15
-; CHECK-NEXT:    global_load_ubyte v5, v[2:3], off offset:14
-; CHECK-NEXT:    global_load_ubyte v6, v[2:3], off offset:13
-; CHECK-NEXT:    global_load_ubyte v7, v[2:3], off offset:12
-; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:11
-; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:10
-; CHECK-NEXT:    global_load_ubyte v10, v[2:3], off offset:9
-; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:8
-; CHECK-NEXT:    global_load_ubyte v12, v[2:3], off offset:7
-; CHECK-NEXT:    global_load_ubyte v13, v[2:3], off offset:6
-; CHECK-NEXT:    global_load_ubyte v14, v[2:3], off offset:5
-; CHECK-NEXT:    global_load_ubyte v15, v[2:3], off offset:4
-; CHECK-NEXT:    global_load_ubyte v16, v[2:3], off offset:3
-; CHECK-NEXT:    global_load_ubyte v17, v[2:3], off offset:2
-; CHECK-NEXT:    global_load_ubyte v18, v[2:3], off offset:1
-; CHECK-NEXT:    global_load_ubyte v2, v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:1
+; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1643,100 +705,19 @@ define void @memmove_p0_p4_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p4_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1e
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:30
-; CHECK-NEXT:    global_load_ubyte v5, v[2:3], off offset:29
-; CHECK-NEXT:    global_load_ubyte v6, v[2:3], off offset:28
-; CHECK-NEXT:    global_load_ubyte v7, v[2:3], off offset:27
-; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:26
-; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:25
-; CHECK-NEXT:    global_load_ubyte v10, v[2:3], off offset:24
-; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:23
-; CHECK-NEXT:    global_load_ubyte v12, v[2:3], off offset:22
-; CHECK-NEXT:    global_load_ubyte v13, v[2:3], off offset:21
-; CHECK-NEXT:    global_load_ubyte v14, v[2:3], off offset:20
-; CHECK-NEXT:    global_load_ubyte v15, v[2:3], off offset:19
-; CHECK-NEXT:    global_load_ubyte v16, v[2:3], off offset:18
-; CHECK-NEXT:    global_load_ubyte v17, v[2:3], off offset:17
-; CHECK-NEXT:    global_load_ubyte v18, v[2:3], off offset:16
-; CHECK-NEXT:    global_load_ubyte v19, v[2:3], off offset:15
-; CHECK-NEXT:    global_load_ubyte v20, v[2:3], off offset:14
-; CHECK-NEXT:    global_load_ubyte v21, v[2:3], off offset:13
-; CHECK-NEXT:    global_load_ubyte v22, v[2:3], off offset:12
-; CHECK-NEXT:    global_load_ubyte v23, v[2:3], off offset:11
-; CHECK-NEXT:    global_load_ubyte v24, v[2:3], off offset:10
-; CHECK-NEXT:    global_load_ubyte v25, v[2:3], off offset:9
-; CHECK-NEXT:    global_load_ubyte v26, v[2:3], off offset:8
-; CHECK-NEXT:    global_load_ubyte v27, v[2:3], off offset:7
-; CHECK-NEXT:    global_load_ubyte v28, v[2:3], off offset:6
-; CHECK-NEXT:    global_load_ubyte v29, v[2:3], off offset:5
-; CHECK-NEXT:    global_load_ubyte v30, v[2:3], off offset:4
-; CHECK-NEXT:    global_load_ubyte v31, v[2:3], off offset:3
-; CHECK-NEXT:    global_load_ubyte v32, v[2:3], off offset:2
-; CHECK-NEXT:    global_load_ubyte v33, v[2:3], off offset:1
-; CHECK-NEXT:    global_load_ubyte v2, v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(29)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(21)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v22 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_byte v[0:1], v23 offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_byte v[0:1], v24 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_byte v[0:1], v25 offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_byte v[0:1], v26 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_byte v[0:1], v27 offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_byte v[0:1], v28 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_byte v[0:1], v29 offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v30 offset:4
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:30
+; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:28
+; CHECK-NEXT:    global_load_dwordx3 v[6:8], v[2:3], off offset:16
+; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v31 offset:3
+; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_byte v[0:1], v32 offset:2
+; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_byte v[0:1], v33 offset:1
+; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[6:8] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1748,103 +729,13 @@ define void @memmove_p0_p4_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p4_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1f
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:31
-; CHECK-NEXT:    global_load_ubyte v5, v[2:3], off offset:30
-; CHECK-NEXT:    global_load_ubyte v6, v[2:3], off offset:29
-; CHECK-NEXT:    global_load_ubyte v7, v[2:3], off offset:28
-; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:27
-; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:26
-; CHECK-NEXT:    global_load_ubyte v10, v[2:3], off offset:25
-; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:24
-; CHECK-NEXT:    global_load_ubyte v12, v[2:3], off offset:23
-; CHECK-NEXT:    global_load_ubyte v13, v[2:3], off offset:22
-; CHECK-NEXT:    global_load_ubyte v14, v[2:3], off offset:21
-; CHECK-NEXT:    global_load_ubyte v15, v[2:3], off offset:20
-; CHECK-NEXT:    global_load_ubyte v16, v[2:3], off offset:19
-; CHECK-NEXT:    global_load_ubyte v17, v[2:3], off offset:18
-; CHECK-NEXT:    global_load_ubyte v18, v[2:3], off offset:17
-; CHECK-NEXT:    global_load_ubyte v19, v[2:3], off offset:16
-; CHECK-NEXT:    global_load_ubyte v20, v[2:3], off offset:15
-; CHECK-NEXT:    global_load_ubyte v21, v[2:3], off offset:14
-; CHECK-NEXT:    global_load_ubyte v22, v[2:3], off offset:13
-; CHECK-NEXT:    global_load_ubyte v23, v[2:3], off offset:12
-; CHECK-NEXT:    global_load_ubyte v24, v[2:3], off offset:11
-; CHECK-NEXT:    global_load_ubyte v25, v[2:3], off offset:10
-; CHECK-NEXT:    global_load_ubyte v26, v[2:3], off offset:9
-; CHECK-NEXT:    global_load_ubyte v27, v[2:3], off offset:8
-; CHECK-NEXT:    global_load_ubyte v28, v[2:3], off offset:7
-; CHECK-NEXT:    global_load_ubyte v29, v[2:3], off offset:6
-; CHECK-NEXT:    global_load_ubyte v30, v[2:3], off offset:5
-; CHECK-NEXT:    global_load_ubyte v31, v[2:3], off offset:4
-; CHECK-NEXT:    global_load_ubyte v32, v[2:3], off offset:3
-; CHECK-NEXT:    global_load_ubyte v33, v[2:3], off offset:2
-; CHECK-NEXT:    global_load_ubyte v34, v[2:3], off offset:1
-; CHECK-NEXT:    global_load_ubyte v2, v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:31
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(29)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(21)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v22 offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v23 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_byte v[0:1], v24 offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_byte v[0:1], v25 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_byte v[0:1], v26 offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_byte v[0:1], v27 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_byte v[0:1], v28 offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_byte v[0:1], v29 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_byte v[0:1], v30 offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v31 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v32 offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_byte v[0:1], v33 offset:2
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:16
+; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_byte v[0:1], v34 offset:1
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1856,31 +747,9 @@ define void @memmove_p0_p4_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p4_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    global_load_ushort v4, v[2:3], off offset:14
-; CHECK-NEXT:    global_load_ushort v5, v[2:3], off offset:12
-; CHECK-NEXT:    global_load_ushort v6, v[2:3], off offset:10
-; CHECK-NEXT:    global_load_ushort v7, v[2:3], off offset:8
-; CHECK-NEXT:    global_load_ushort v8, v[2:3], off offset:6
-; CHECK-NEXT:    global_load_ushort v9, v[2:3], off offset:4
-; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:2
-; CHECK-NEXT:    global_load_ushort v2, v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:2
+; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1892,55 +761,19 @@ define void @memmove_p0_p4_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p4_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:30
-; CHECK-NEXT:    global_load_ushort v5, v[2:3], off offset:28
-; CHECK-NEXT:    global_load_ushort v6, v[2:3], off offset:26
-; CHECK-NEXT:    global_load_ushort v7, v[2:3], off offset:24
-; CHECK-NEXT:    global_load_ushort v8, v[2:3], off offset:22
-; CHECK-NEXT:    global_load_ushort v9, v[2:3], off offset:20
-; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:18
-; CHECK-NEXT:    global_load_ushort v11, v[2:3], off offset:16
-; CHECK-NEXT:    global_load_ushort v12, v[2:3], off offset:14
-; CHECK-NEXT:    global_load_ushort v13, v[2:3], off offset:12
-; CHECK-NEXT:    global_load_ushort v14, v[2:3], off offset:10
-; CHECK-NEXT:    global_load_ushort v15, v[2:3], off offset:8
-; CHECK-NEXT:    global_load_ushort v16, v[2:3], off offset:6
-; CHECK-NEXT:    global_load_ushort v17, v[2:3], off offset:4
-; CHECK-NEXT:    global_load_ushort v18, v[2:3], off offset:2
-; CHECK-NEXT:    global_load_ushort v2, v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v12 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_short v[0:1], v13 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_short v[0:1], v14 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_short v[0:1], v15 offset:8
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:30
+; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:28
+; CHECK-NEXT:    global_load_dwordx3 v[6:8], v[2:3], off offset:16
+; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_short v[0:1], v16 offset:6
+; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_short v[0:1], v17 offset:4
+; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_short v[0:1], v18 offset:2
+; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[6:8] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1952,55 +785,13 @@ define void @memmove_p0_p4_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p4_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ushort v4, v[2:3], off offset:30
-; CHECK-NEXT:    global_load_ushort v5, v[2:3], off offset:28
-; CHECK-NEXT:    global_load_ushort v6, v[2:3], off offset:26
-; CHECK-NEXT:    global_load_ushort v7, v[2:3], off offset:24
-; CHECK-NEXT:    global_load_ushort v8, v[2:3], off offset:22
-; CHECK-NEXT:    global_load_ushort v9, v[2:3], off offset:20
-; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:18
-; CHECK-NEXT:    global_load_ushort v11, v[2:3], off offset:16
-; CHECK-NEXT:    global_load_ushort v12, v[2:3], off offset:14
-; CHECK-NEXT:    global_load_ushort v13, v[2:3], off offset:12
-; CHECK-NEXT:    global_load_ushort v14, v[2:3], off offset:10
-; CHECK-NEXT:    global_load_ushort v15, v[2:3], off offset:8
-; CHECK-NEXT:    global_load_ushort v16, v[2:3], off offset:6
-; CHECK-NEXT:    global_load_ushort v17, v[2:3], off offset:4
-; CHECK-NEXT:    global_load_ushort v18, v[2:3], off offset:2
-; CHECK-NEXT:    global_load_ushort v2, v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v12 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_short v[0:1], v13 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_short v[0:1], v14 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_short v[0:1], v15 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_short v[0:1], v16 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_short v[0:1], v17 offset:4
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:16
+; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_short v[0:1], v18 offset:2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2129,55 +920,13 @@ define void @memmove_p0_p5_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p5_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v2, v2, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:1
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2189,100 +938,23 @@ define void @memmove_p0_p5_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p5_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1e
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v2, v2, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(29)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(21)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_byte v[0:1], v22 offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_byte v[0:1], v23 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_byte v[0:1], v24 offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_byte v[0:1], v25 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_byte v[0:1], v26 offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_byte v[0:1], v27 offset:6
+; CHECK-NEXT:    s_clause 0x8
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_byte v[0:1], v28 offset:5
+; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v29 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v30 offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_byte v[0:1], v31 offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_byte v[0:1], v32 offset:1
+; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:30
+; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[7:9] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2294,103 +966,19 @@ define void @memmove_p0_p5_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p5_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1f
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:31
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v2, v2, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:31
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(29)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(21)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v22 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_byte v[0:1], v23 offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_byte v[0:1], v24 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_byte v[0:1], v25 offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_byte v[0:1], v26 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_byte v[0:1], v27 offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_byte v[0:1], v28 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_byte v[0:1], v29 offset:5
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v30 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v31 offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_byte v[0:1], v32 offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_byte v[0:1], v33 offset:1
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[7:10]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2402,31 +990,13 @@ define void @memmove_p0_p5_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p5_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    buffer_load_ushort v3, v2, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ushort v4, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v5, v2, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ushort v6, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ushort v7, v2, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ushort v8, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ushort v9, v2, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ushort v2, v2, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v3 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:2
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2438,55 +1008,23 @@ define void @memmove_p0_p5_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p5_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ushort v4, v2, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ushort v5, v2, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ushort v6, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ushort v7, v2, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ushort v8, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ushort v9, v2, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ushort v10, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ushort v11, v2, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ushort v12, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v13, v2, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ushort v14, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ushort v15, v2, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ushort v16, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ushort v17, v2, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ushort v2, v2, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_short v[0:1], v12 offset:12
+; CHECK-NEXT:    s_clause 0x8
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_short v[0:1], v13 offset:10
+; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_short v[0:1], v14 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_short v[0:1], v15 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_short v[0:1], v16 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_short v[0:1], v17 offset:2
+; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:30
+; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[7:9] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2498,55 +1036,19 @@ define void @memmove_p0_p5_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p5_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    buffer_load_ushort v3, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ushort v4, v2, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ushort v5, v2, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ushort v6, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ushort v7, v2, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ushort v8, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ushort v9, v2, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ushort v10, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ushort v11, v2, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ushort v12, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v13, v2, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ushort v14, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ushort v15, v2, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ushort v16, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ushort v17, v2, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ushort v2, v2, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v3 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_short v[0:1], v12 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_short v[0:1], v13 offset:10
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_short v[0:1], v14 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_short v[0:1], v15 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_short v[0:1], v16 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_short v[0:1], v17 offset:2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[7:10]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2698,41 +1200,8 @@ define void @memmove_p1_p0_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p1_p0_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:5
-; CHECK-NEXT:    flat_load_ubyte v5, v[2:3] offset:6
-; CHECK-NEXT:    flat_load_ubyte v6, v[2:3] offset:7
-; CHECK-NEXT:    flat_load_ubyte v7, v[2:3]
-; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:1
-; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:2
-; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:4
-; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:3
-; CHECK-NEXT:    flat_load_ubyte v12, v[2:3] offset:15
-; CHECK-NEXT:    flat_load_ubyte v13, v[2:3] offset:14
-; CHECK-NEXT:    flat_load_ubyte v14, v[2:3] offset:13
-; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:12
-; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:9
-; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:8
-; CHECK-NEXT:    flat_load_ubyte v18, v[2:3] offset:11
-; CHECK-NEXT:    flat_load_ubyte v2, v[2:3] offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v4, 8, v10
-; CHECK-NEXT:    v_lshl_or_b32 v4, v6, 8, v5
-; CHECK-NEXT:    v_lshl_or_b32 v5, v8, 8, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v11, 8, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v12, 8, v13
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v14, 8, v15
-; CHECK-NEXT:    v_lshl_or_b32 v3, v4, 16, v3
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v16, 8, v17
+; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v18, 8, v2
-; CHECK-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
-; CHECK-NEXT:    v_lshl_or_b32 v5, v8, 16, v7
-; CHECK-NEXT:    v_lshl_or_b32 v4, v10, 16, v9
 ; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2744,79 +1213,18 @@ define void @memmove_p1_p0_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p1_p0_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1e
-; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:15
-; CHECK-NEXT:    flat_load_ubyte v5, v[2:3] offset:14
-; CHECK-NEXT:    flat_load_ubyte v6, v[2:3] offset:13
-; CHECK-NEXT:    flat_load_ubyte v7, v[2:3] offset:12
-; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:23
-; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:22
-; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:21
-; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:20
-; CHECK-NEXT:    flat_load_ubyte v12, v[2:3] offset:29
-; CHECK-NEXT:    flat_load_ubyte v13, v[2:3] offset:30
-; CHECK-NEXT:    flat_load_ubyte v14, v[2:3] offset:27
-; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:26
-; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:25
-; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:24
-; CHECK-NEXT:    flat_load_ubyte v18, v[2:3] offset:19
-; CHECK-NEXT:    flat_load_ubyte v19, v[2:3] offset:18
-; CHECK-NEXT:    flat_load_ubyte v20, v[2:3] offset:17
-; CHECK-NEXT:    flat_load_ubyte v21, v[2:3] offset:16
-; CHECK-NEXT:    flat_load_ubyte v22, v[2:3] offset:11
-; CHECK-NEXT:    flat_load_ubyte v23, v[2:3] offset:10
-; CHECK-NEXT:    flat_load_ubyte v24, v[2:3] offset:9
-; CHECK-NEXT:    flat_load_ubyte v25, v[2:3] offset:8
-; CHECK-NEXT:    flat_load_ubyte v26, v[2:3] offset:7
-; CHECK-NEXT:    flat_load_ubyte v27, v[2:3] offset:6
-; CHECK-NEXT:    flat_load_ubyte v28, v[2:3] offset:5
-; CHECK-NEXT:    flat_load_ubyte v29, v[2:3] offset:4
-; CHECK-NEXT:    flat_load_ubyte v30, v[2:3] offset:1
-; CHECK-NEXT:    flat_load_ubyte v31, v[2:3]
-; CHECK-NEXT:    flat_load_ubyte v32, v[2:3] offset:3
-; CHECK-NEXT:    flat_load_ubyte v33, v[2:3] offset:2
-; CHECK-NEXT:    flat_load_ubyte v2, v[2:3] offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(29) lgkmcnt(29)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v4, 8, v5
-; CHECK-NEXT:    s_waitcnt vmcnt(25) lgkmcnt(25)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v8, 8, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(23) lgkmcnt(23)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v10, 8, v11
-; CHECK-NEXT:    s_waitcnt vmcnt(22) lgkmcnt(22)
-; CHECK-NEXT:    v_lshlrev_b16 v12, 8, v12
-; CHECK-NEXT:    s_waitcnt vmcnt(19) lgkmcnt(19)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v14, 8, v15
-; CHECK-NEXT:    s_waitcnt vmcnt(17) lgkmcnt(17)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v16, 8, v17
-; CHECK-NEXT:    v_lshl_or_b32 v16, v6, 8, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT:    v_lshl_or_b32 v15, v18, 8, v19
-; CHECK-NEXT:    v_lshl_or_b32 v7, v9, 16, v10
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(13)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v20, 8, v21
-; CHECK-NEXT:    v_lshl_or_b32 v8, v14, 16, v3
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(11)
-; CHECK-NEXT:    v_lshl_or_b32 v18, v22, 8, v23
-; CHECK-NEXT:    v_lshl_or_b32 v5, v4, 16, v16
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
-; CHECK-NEXT:    v_lshl_or_b32 v17, v24, 8, v25
-; CHECK-NEXT:    v_lshl_or_b32 v6, v15, 16, v11
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
-; CHECK-NEXT:    v_lshl_or_b32 v20, v26, 8, v27
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
-; CHECK-NEXT:    v_lshl_or_b32 v19, v28, 8, v29
-; CHECK-NEXT:    v_lshl_or_b32 v4, v18, 16, v17
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:30
+; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:28
+; CHECK-NEXT:    flat_load_dwordx3 v[6:8], v[2:3] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
 ; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    v_lshl_or_b32 v21, v30, 8, v31
+; CHECK-NEXT:    global_store_byte v[0:1], v9, off offset:30
+; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT:    global_store_short v[0:1], v10, off offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    v_lshl_or_b32 v22, v32, 8, v33
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_or_b32_e32 v12, v12, v2
-; CHECK-NEXT:    v_lshl_or_b32 v3, v20, 16, v19
-; CHECK-NEXT:    v_lshl_or_b32 v2, v22, 16, v21
-; CHECK-NEXT:    global_store_byte v[0:1], v13, off offset:30
-; CHECK-NEXT:    global_store_short v[0:1], v12, off offset:28
 ; CHECK-NEXT:    global_store_dwordx3 v[0:1], v[6:8], off offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2828,79 +1236,13 @@ define void @memmove_p1_p0_sz32_align_1_1(ptr addrspace(1) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p1_p0_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1f
-; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:29
-; CHECK-NEXT:    flat_load_ubyte v5, v[2:3] offset:30
-; CHECK-NEXT:    flat_load_ubyte v6, v[2:3] offset:31
-; CHECK-NEXT:    flat_load_ubyte v7, v[2:3] offset:24
-; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:25
-; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:26
-; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:28
-; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:27
-; CHECK-NEXT:    flat_load_ubyte v12, v[2:3] offset:15
-; CHECK-NEXT:    flat_load_ubyte v13, v[2:3] offset:14
-; CHECK-NEXT:    flat_load_ubyte v14, v[2:3] offset:13
-; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:12
-; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:23
-; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:22
-; CHECK-NEXT:    flat_load_ubyte v18, v[2:3] offset:21
-; CHECK-NEXT:    flat_load_ubyte v19, v[2:3] offset:20
-; CHECK-NEXT:    flat_load_ubyte v20, v[2:3] offset:19
-; CHECK-NEXT:    flat_load_ubyte v21, v[2:3] offset:18
-; CHECK-NEXT:    flat_load_ubyte v22, v[2:3] offset:17
-; CHECK-NEXT:    flat_load_ubyte v23, v[2:3] offset:16
-; CHECK-NEXT:    flat_load_ubyte v24, v[2:3] offset:11
-; CHECK-NEXT:    flat_load_ubyte v25, v[2:3] offset:10
-; CHECK-NEXT:    flat_load_ubyte v26, v[2:3] offset:9
-; CHECK-NEXT:    flat_load_ubyte v27, v[2:3] offset:8
-; CHECK-NEXT:    flat_load_ubyte v28, v[2:3] offset:7
-; CHECK-NEXT:    flat_load_ubyte v29, v[2:3] offset:6
-; CHECK-NEXT:    flat_load_ubyte v30, v[2:3] offset:5
-; CHECK-NEXT:    flat_load_ubyte v31, v[2:3] offset:4
-; CHECK-NEXT:    flat_load_ubyte v32, v[2:3] offset:1
-; CHECK-NEXT:    flat_load_ubyte v33, v[2:3]
-; CHECK-NEXT:    flat_load_ubyte v34, v[2:3] offset:3
-; CHECK-NEXT:    flat_load_ubyte v2, v[2:3] offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(25) lgkmcnt(25)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v4, 8, v10
-; CHECK-NEXT:    v_lshl_or_b32 v4, v6, 8, v5
-; CHECK-NEXT:    v_lshl_or_b32 v6, v8, 8, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(24) lgkmcnt(24)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v11, 8, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(22) lgkmcnt(22)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v12, 8, v13
-; CHECK-NEXT:    s_waitcnt vmcnt(20) lgkmcnt(20)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v14, 8, v15
-; CHECK-NEXT:    v_lshl_or_b32 v5, v4, 16, v3
-; CHECK-NEXT:    s_waitcnt vmcnt(18) lgkmcnt(18)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v16, 8, v17
-; CHECK-NEXT:    v_lshl_or_b32 v4, v7, 16, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(16) lgkmcnt(16)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v18, 8, v19
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v20, 8, v21
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v22, 8, v23
-; CHECK-NEXT:    v_lshl_or_b32 v3, v9, 16, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v15, v24, 8, v25
-; CHECK-NEXT:    v_lshl_or_b32 v9, v12, 16, v14
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v26, 8, v27
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v17, v28, 8, v29
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v16, v30, 8, v31
-; CHECK-NEXT:    v_lshl_or_b32 v8, v15, 16, v13
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v18, v32, 8, v33
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[2:3] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[8:11], v[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v19, v34, 8, v2
-; CHECK-NEXT:    v_lshl_or_b32 v2, v11, 16, v10
-; CHECK-NEXT:    v_lshl_or_b32 v7, v17, 16, v16
-; CHECK-NEXT:    v_lshl_or_b32 v6, v19, 16, v18
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:16
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[6:9], off
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[8:11], off
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 32, i1 false)
@@ -2911,23 +1253,8 @@ define void @memmove_p1_p0_sz16_align_2_2(ptr addrspace(1) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p1_p0_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    flat_load_ushort v4, v[2:3] offset:6
-; CHECK-NEXT:    flat_load_ushort v5, v[2:3]
-; CHECK-NEXT:    flat_load_ushort v6, v[2:3] offset:12
-; CHECK-NEXT:    flat_load_ushort v7, v[2:3] offset:8
-; CHECK-NEXT:    flat_load_ushort v8, v[2:3] offset:4
-; CHECK-NEXT:    flat_load_ushort v9, v[2:3] offset:2
-; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:14
-; CHECK-NEXT:    flat_load_ushort v11, v[2:3] offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v4, 16, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v9, 16, v5
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v10, 16, v6
+; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v11, 16, v7
 ; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2939,41 +1266,18 @@ define void @memmove_p1_p0_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p1_p0_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ushort v4, v[2:3] offset:14
-; CHECK-NEXT:    flat_load_ushort v5, v[2:3] offset:12
-; CHECK-NEXT:    flat_load_ushort v9, v[2:3] offset:10
-; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:8
-; CHECK-NEXT:    flat_load_ushort v11, v[2:3] offset:28
-; CHECK-NEXT:    flat_load_ushort v6, v[2:3] offset:26
-; CHECK-NEXT:    flat_load_ushort v7, v[2:3] offset:24
-; CHECK-NEXT:    flat_load_ushort v12, v[2:3] offset:22
-; CHECK-NEXT:    flat_load_ushort v13, v[2:3] offset:20
-; CHECK-NEXT:    flat_load_ushort v14, v[2:3] offset:18
-; CHECK-NEXT:    flat_load_ushort v15, v[2:3] offset:16
-; CHECK-NEXT:    flat_load_ushort v16, v[2:3] offset:6
-; CHECK-NEXT:    flat_load_ushort v17, v[2:3] offset:4
-; CHECK-NEXT:    flat_load_ushort v18, v[2:3] offset:2
-; CHECK-NEXT:    flat_load_ushort v19, v[2:3]
-; CHECK-NEXT:    flat_load_ubyte v20, v[2:3] offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v4, 16, v5
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v9, 16, v10
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v6, 16, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v12, 16, v13
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v14, 16, v15
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:30
+; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:28
+; CHECK-NEXT:    flat_load_dwordx3 v[6:8], v[2:3] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
 ; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v16, 16, v17
+; CHECK-NEXT:    global_store_byte v[0:1], v9, off offset:30
+; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT:    global_store_short v[0:1], v10, off offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v18, 16, v19
-; CHECK-NEXT:    global_store_short v[0:1], v11, off offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    global_store_byte v[0:1], v20, off offset:30
 ; CHECK-NEXT:    global_store_dwordx3 v[0:1], v[6:8], off offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2985,39 +1289,13 @@ define void @memmove_p1_p0_sz32_align_2_2(ptr addrspace(1) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p1_p0_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ushort v4, v[2:3] offset:30
-; CHECK-NEXT:    flat_load_ushort v5, v[2:3] offset:28
-; CHECK-NEXT:    flat_load_ushort v6, v[2:3] offset:26
-; CHECK-NEXT:    flat_load_ushort v7, v[2:3] offset:14
-; CHECK-NEXT:    flat_load_ushort v8, v[2:3] offset:12
-; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:10
-; CHECK-NEXT:    flat_load_ushort v11, v[2:3] offset:8
-; CHECK-NEXT:    flat_load_ushort v9, v[2:3] offset:24
-; CHECK-NEXT:    flat_load_ushort v12, v[2:3] offset:22
-; CHECK-NEXT:    flat_load_ushort v13, v[2:3] offset:20
-; CHECK-NEXT:    flat_load_ushort v14, v[2:3] offset:18
-; CHECK-NEXT:    flat_load_ushort v15, v[2:3] offset:16
-; CHECK-NEXT:    flat_load_ushort v16, v[2:3] offset:6
-; CHECK-NEXT:    flat_load_ushort v17, v[2:3] offset:4
-; CHECK-NEXT:    flat_load_ushort v18, v[2:3] offset:2
-; CHECK-NEXT:    flat_load_ushort v19, v[2:3]
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v4, 16, v5
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v6, 16, v9
-; CHECK-NEXT:    v_lshl_or_b32 v9, v7, 16, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v12, 16, v13
-; CHECK-NEXT:    v_lshl_or_b32 v8, v10, 16, v11
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v14, 16, v15
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v16, 16, v17
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[2:3] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[8:11], v[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v18, 16, v19
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:16
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[6:9], off
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[8:11], off
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 32, i1 false)
@@ -3783,44 +2061,13 @@ define void @memmove_p1_p5_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p1_p5_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v4, 8, v3
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v6, 8, v5
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v8, 8, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v10, 8, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v12, 8, v11
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v14, 8, v13
-; CHECK-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v16, 8, v15
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v2, 8, v17
-; CHECK-NEXT:    v_lshl_or_b32 v2, v5, 16, v4
-; CHECK-NEXT:    v_lshl_or_b32 v5, v9, 16, v6
-; CHECK-NEXT:    v_lshl_or_b32 v4, v11, 16, v10
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 16, i1 false)
@@ -3831,82 +2078,24 @@ define void @memmove_p1_p5_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p1_p5_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1e
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(29)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v4, 8, v3
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v6, 8, v5
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    v_lshlrev_b16 v4, 8, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v9, 8, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v11, 8, v10
-; CHECK-NEXT:    v_lshl_or_b32 v8, v3, 16, v2
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v13, 8, v12
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v15, 8, v14
-; CHECK-NEXT:    v_lshl_or_b32 v2, v9, 16, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v17, 8, v16
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v19, 8, v18
-; CHECK-NEXT:    v_lshl_or_b32 v3, v6, 16, v5
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v21, 8, v20
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v23, 8, v22
+; CHECK-NEXT:    s_clause 0x8
+; CHECK-NEXT:    buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    s_waitcnt vmcnt(8)
+; CHECK-NEXT:    global_store_short v[0:1], v10, off offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    v_lshl_or_b32 v15, v26, 8, v25
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v24, 8, v28
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v16, v29, 8, v27
-; CHECK-NEXT:    v_lshl_or_b32 v5, v11, 16, v10
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v17, v31, 8, v30
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    v_or_b32_e32 v18, v4, v32
-; CHECK-NEXT:    v_lshl_or_b32 v4, v13, 16, v12
-; CHECK-NEXT:    v_lshl_or_b32 v7, v15, 16, v14
-; CHECK-NEXT:    v_lshl_or_b32 v6, v16, 16, v17
+; CHECK-NEXT:    global_store_byte v[0:1], v11, off offset:30
+; CHECK-NEXT:    s_waitcnt vmcnt(3)
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_byte v[0:1], v33, off offset:30
-; CHECK-NEXT:    global_store_short v[0:1], v18, off offset:28
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
-; CHECK-NEXT:    global_store_dwordx3 v[0:1], v[6:8], off offset:16
+; CHECK-NEXT:    global_store_dwordx3 v[0:1], v[7:9], off offset:16
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -3917,81 +2106,19 @@ define void @memmove_p1_p5_sz32_align_1_1(ptr addrspace(1) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p1_p5_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1f
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:31
-; CHECK-NEXT:    buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v4, 8, v3
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v6, 8, v5
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v8, 8, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v10, 8, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v12, 8, v11
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v14, 8, v13
-; CHECK-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v16, 8, v15
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v18, 8, v17
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v21, 8, v20
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    v_lshl_or_b32 v15, v23, 8, v22
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v19, 8, v25
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v26, 8, v24
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v16, v28, 8, v27
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v17, v30, 8, v29
-; CHECK-NEXT:    v_lshl_or_b32 v7, v13, 16, v12
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v18, v32, 8, v31
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v19, v2, 8, v33
-; CHECK-NEXT:    v_lshl_or_b32 v2, v5, 16, v4
-; CHECK-NEXT:    v_lshl_or_b32 v5, v9, 16, v6
-; CHECK-NEXT:    v_lshl_or_b32 v4, v11, 16, v10
-; CHECK-NEXT:    v_lshl_or_b32 v6, v14, 16, v15
-; CHECK-NEXT:    v_lshl_or_b32 v9, v17, 16, v16
-; CHECK-NEXT:    v_lshl_or_b32 v8, v19, 16, v18
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[6:9], off offset:16
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:16
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 32, i1 false)
@@ -4002,24 +2129,13 @@ define void @memmove_p1_p5_sz16_align_2_2(ptr addrspace(1) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p1_p5_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    buffer_load_ushort v4, v2, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ushort v5, v2, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ushort v3, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ushort v6, v2, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ushort v7, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v8, v2, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ushort v9, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ushort v10, v2, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v5, 16, v4
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v6, 16, v3
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v8, 16, v7
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v10, 16, v9
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 16, i1 false)
@@ -4030,43 +2146,24 @@ define void @memmove_p1_p5_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p1_p5_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    buffer_load_ushort v3, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ushort v4, v2, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ushort v5, v2, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ushort v6, v2, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ushort v7, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ushort v9, v2, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ushort v10, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ushort v11, v2, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ushort v12, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v13, v2, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ushort v14, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ushort v15, v2, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ushort v16, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ushort v17, v2, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ushort v18, v2, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v4, 16, v3
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v9, 16, v7
+; CHECK-NEXT:    s_clause 0x8
+; CHECK-NEXT:    buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v11, 16, v10
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v13, 16, v12
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v15, 16, v14
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v17, 16, v16
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    global_store_short v[0:1], v18, off offset:28
+; CHECK-NEXT:    global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT:    s_waitcnt vmcnt(7)
+; CHECK-NEXT:    global_store_byte v[0:1], v11, off offset:30
+; CHECK-NEXT:    s_waitcnt vmcnt(3)
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_byte v[0:1], v19, off offset:30
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
-; CHECK-NEXT:    global_store_dwordx3 v[0:1], v[6:8], off offset:16
+; CHECK-NEXT:    global_store_dwordx3 v[0:1], v[7:9], off offset:16
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -4077,41 +2174,19 @@ define void @memmove_p1_p5_sz32_align_2_2(ptr addrspace(1) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p1_p5_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    buffer_load_ushort v4, v2, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ushort v5, v2, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ushort v3, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ushort v6, v2, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ushort v7, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ushort v8, v2, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ushort v9, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v10, v2, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ushort v11, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ushort v12, v2, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ushort v13, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ushort v14, v2, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ushort v15, v2, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ushort v16, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ushort v17, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ushort v18, v2, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v5, 16, v4
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v6, 16, v3
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v8, 16, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v10, 16, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v12, 16, v11
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v14, 16, v13
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v16, 16, v15
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v18, 16, v17
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[6:9], off offset:16
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:16
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 32, i1 false)
@@ -4258,41 +2333,8 @@ define void @memmove_p3_p0_sz16_align_1_1(ptr addrspace(3) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p3_p0_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ubyte v3, v[1:2] offset:5
-; CHECK-NEXT:    flat_load_ubyte v4, v[1:2] offset:6
-; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:7
-; CHECK-NEXT:    flat_load_ubyte v6, v[1:2]
-; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:1
-; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:2
-; CHECK-NEXT:    flat_load_ubyte v9, v[1:2] offset:4
-; CHECK-NEXT:    flat_load_ubyte v10, v[1:2] offset:3
-; CHECK-NEXT:    flat_load_ubyte v11, v[1:2] offset:15
-; CHECK-NEXT:    flat_load_ubyte v12, v[1:2] offset:14
-; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:13
-; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:12
-; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:9
-; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:8
-; CHECK-NEXT:    flat_load_ubyte v17, v[1:2] offset:11
-; CHECK-NEXT:    flat_load_ubyte v1, v[1:2] offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v3, 8, v9
-; CHECK-NEXT:    v_lshl_or_b32 v3, v5, 8, v4
-; CHECK-NEXT:    v_lshl_or_b32 v4, v7, 8, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v10, 8, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v11, 8, v12
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v13, 8, v14
-; CHECK-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v15, 8, v16
+; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v17, 8, v1
-; CHECK-NEXT:    v_lshl_or_b32 v1, v5, 16, v4
-; CHECK-NEXT:    v_lshl_or_b32 v4, v7, 16, v6
-; CHECK-NEXT:    v_lshl_or_b32 v3, v9, 16, v8
 ; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -4305,82 +2347,20 @@ define void @memmove_p3_p0_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p3_p0_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1e
-; CHECK-NEXT:    flat_load_ubyte v3, v[1:2] offset:15
-; CHECK-NEXT:    flat_load_ubyte v4, v[1:2] offset:14
-; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:13
-; CHECK-NEXT:    flat_load_ubyte v6, v[1:2] offset:12
-; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:23
-; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:22
-; CHECK-NEXT:    flat_load_ubyte v9, v[1:2] offset:21
-; CHECK-NEXT:    flat_load_ubyte v10, v[1:2] offset:20
-; CHECK-NEXT:    flat_load_ubyte v11, v[1:2] offset:29
-; CHECK-NEXT:    flat_load_ubyte v12, v[1:2] offset:30
-; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:27
-; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:26
-; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:25
-; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:24
-; CHECK-NEXT:    flat_load_ubyte v17, v[1:2] offset:19
-; CHECK-NEXT:    flat_load_ubyte v18, v[1:2] offset:18
-; CHECK-NEXT:    flat_load_ubyte v19, v[1:2] offset:17
-; CHECK-NEXT:    flat_load_ubyte v20, v[1:2] offset:16
-; CHECK-NEXT:    flat_load_ubyte v21, v[1:2] offset:11
-; CHECK-NEXT:    flat_load_ubyte v22, v[1:2] offset:10
-; CHECK-NEXT:    flat_load_ubyte v23, v[1:2] offset:9
-; CHECK-NEXT:    flat_load_ubyte v24, v[1:2] offset:8
-; CHECK-NEXT:    flat_load_ubyte v25, v[1:2] offset:7
-; CHECK-NEXT:    flat_load_ubyte v26, v[1:2] offset:6
-; CHECK-NEXT:    flat_load_ubyte v27, v[1:2] offset:5
-; CHECK-NEXT:    flat_load_ubyte v28, v[1:2] offset:4
-; CHECK-NEXT:    flat_load_ubyte v29, v[1:2] offset:1
-; CHECK-NEXT:    flat_load_ubyte v30, v[1:2]
-; CHECK-NEXT:    flat_load_ubyte v31, v[1:2] offset:3
-; CHECK-NEXT:    flat_load_ubyte v32, v[1:2] offset:2
-; CHECK-NEXT:    flat_load_ubyte v1, v[1:2] offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(29) lgkmcnt(29)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v3, 8, v4
-; CHECK-NEXT:    s_waitcnt vmcnt(27) lgkmcnt(27)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v5, 8, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(25) lgkmcnt(25)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v7, 8, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(23) lgkmcnt(23)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v9, 8, v10
-; CHECK-NEXT:    s_waitcnt vmcnt(22) lgkmcnt(22)
-; CHECK-NEXT:    v_lshlrev_b16 v11, 8, v11
-; CHECK-NEXT:    v_lshl_or_b32 v4, v3, 16, v5
-; CHECK-NEXT:    s_waitcnt vmcnt(19) lgkmcnt(19)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v13, 8, v14
-; CHECK-NEXT:    s_waitcnt vmcnt(17) lgkmcnt(17)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v15, 8, v16
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v17, 8, v18
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(13)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v19, 8, v20
-; CHECK-NEXT:    v_lshl_or_b32 v13, v13, 16, v2
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(11)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v21, 8, v22
-; CHECK-NEXT:    v_lshl_or_b32 v2, v7, 16, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v23, 8, v24
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
-; CHECK-NEXT:    v_lshl_or_b32 v16, v25, 8, v26
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
-; CHECK-NEXT:    v_lshl_or_b32 v15, v27, 8, v28
-; CHECK-NEXT:    v_lshl_or_b32 v3, v14, 16, v6
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:30
+; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:28
+; CHECK-NEXT:    flat_load_dwordx3 v[5:7], v[1:2] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
 ; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    v_lshl_or_b32 v17, v29, 8, v30
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    v_lshl_or_b32 v18, v31, 8, v32
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_or_b32_e32 v11, v11, v1
-; CHECK-NEXT:    v_lshl_or_b32 v1, v10, 16, v8
-; CHECK-NEXT:    v_lshl_or_b32 v6, v16, 16, v15
-; CHECK-NEXT:    v_lshl_or_b32 v5, v18, 16, v17
-; CHECK-NEXT:    ds_write_b8 v0, v12 offset:30
-; CHECK-NEXT:    ds_write_b32 v0, v13 offset:24
-; CHECK-NEXT:    ds_write_b16 v0, v11 offset:28
-; CHECK-NEXT:    ds_write_b64 v0, v[1:2] offset:16
-; CHECK-NEXT:    ds_write2_b64 v0, v[5:6], v[3:4] offset1:1
+; CHECK-NEXT:    ds_write_b8 v0, v8 offset:30
+; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(3)
+; CHECK-NEXT:    ds_write_b16 v0, v9 offset:28
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(3)
+; CHECK-NEXT:    ds_write_b32 v0, v7 offset:24
+; CHECK-NEXT:    ds_write_b64 v0, v[5:6] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(4)
+; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -4392,79 +2372,13 @@ define void @memmove_p3_p0_sz32_align_1_1(ptr addrspace(3) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p3_p0_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1f
-; CHECK-NEXT:    flat_load_ubyte v3, v[1:2] offset:29
-; CHECK-NEXT:    flat_load_ubyte v4, v[1:2] offset:30
-; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:31
-; CHECK-NEXT:    flat_load_ubyte v6, v[1:2] offset:24
-; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:25
-; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:26
-; CHECK-NEXT:    flat_load_ubyte v9, v[1:2] offset:28
-; CHECK-NEXT:    flat_load_ubyte v10, v[1:2] offset:27
-; CHECK-NEXT:    flat_load_ubyte v11, v[1:2] offset:15
-; CHECK-NEXT:    flat_load_ubyte v12, v[1:2] offset:14
-; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:13
-; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:12
-; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:23
-; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:22
-; CHECK-NEXT:    flat_load_ubyte v17, v[1:2] offset:21
-; CHECK-NEXT:    flat_load_ubyte v18, v[1:2] offset:20
-; CHECK-NEXT:    flat_load_ubyte v19, v[1:2] offset:19
-; CHECK-NEXT:    flat_load_ubyte v20, v[1:2] offset:18
-; CHECK-NEXT:    flat_load_ubyte v21, v[1:2] offset:17
-; CHECK-NEXT:    flat_load_ubyte v22, v[1:2] offset:16
-; CHECK-NEXT:    flat_load_ubyte v23, v[1:2] offset:11
-; CHECK-NEXT:    flat_load_ubyte v24, v[1:2] offset:10
-; CHECK-NEXT:    flat_load_ubyte v25, v[1:2] offset:9
-; CHECK-NEXT:    flat_load_ubyte v26, v[1:2] offset:8
-; CHECK-NEXT:    flat_load_ubyte v27, v[1:2] offset:7
-; CHECK-NEXT:    flat_load_ubyte v28, v[1:2] offset:6
-; CHECK-NEXT:    flat_load_ubyte v29, v[1:2] offset:5
-; CHECK-NEXT:    flat_load_ubyte v30, v[1:2] offset:4
-; CHECK-NEXT:    flat_load_ubyte v31, v[1:2] offset:1
-; CHECK-NEXT:    flat_load_ubyte v32, v[1:2]
-; CHECK-NEXT:    flat_load_ubyte v33, v[1:2] offset:3
-; CHECK-NEXT:    flat_load_ubyte v1, v[1:2] offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(25) lgkmcnt(25)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v3, 8, v9
-; CHECK-NEXT:    v_lshl_or_b32 v3, v5, 8, v4
-; CHECK-NEXT:    v_lshl_or_b32 v4, v7, 8, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(24) lgkmcnt(24)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v10, 8, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(22) lgkmcnt(22)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v11, 8, v12
-; CHECK-NEXT:    s_waitcnt vmcnt(20) lgkmcnt(20)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v13, 8, v14
-; CHECK-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
-; CHECK-NEXT:    s_waitcnt vmcnt(18) lgkmcnt(18)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v15, 8, v16
-; CHECK-NEXT:    s_waitcnt vmcnt(16) lgkmcnt(16)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v17, 8, v18
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v19, 8, v20
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v21, 8, v22
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v23, 8, v24
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v25, 8, v26
-; CHECK-NEXT:    v_lshl_or_b32 v3, v9, 16, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v15, v27, 8, v28
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v29, 8, v30
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v16, v31, 8, v32
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v17, v33, 8, v1
-; CHECK-NEXT:    v_lshl_or_b32 v1, v5, 16, v4
-; CHECK-NEXT:    v_lshl_or_b32 v4, v7, 16, v6
-; CHECK-NEXT:    v_lshl_or_b32 v6, v11, 16, v10
-; CHECK-NEXT:    v_lshl_or_b32 v5, v13, 16, v12
-; CHECK-NEXT:    v_lshl_or_b32 v8, v15, 16, v14
-; CHECK-NEXT:    v_lshl_or_b32 v7, v17, 16, v16
-; CHECK-NEXT:    ds_write2_b64 v0, v[3:4], v[1:2] offset0:2 offset1:3
-; CHECK-NEXT:    ds_write2_b64 v0, v[7:8], v[5:6] offset1:1
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[3:6], v[1:2] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[7:10], v[1:2]
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT:    ds_write2_b64 v0, v[3:4], v[5:6] offset0:2 offset1:3
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(1)
+; CHECK-NEXT:    ds_write2_b64 v0, v[7:8], v[9:10] offset1:1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -4476,23 +2390,8 @@ define void @memmove_p3_p0_sz16_align_2_2(ptr addrspace(3) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p3_p0_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    flat_load_ushort v3, v[1:2] offset:6
-; CHECK-NEXT:    flat_load_ushort v4, v[1:2]
-; CHECK-NEXT:    flat_load_ushort v5, v[1:2] offset:12
-; CHECK-NEXT:    flat_load_ushort v6, v[1:2] offset:8
-; CHECK-NEXT:    flat_load_ushort v7, v[1:2] offset:4
-; CHECK-NEXT:    flat_load_ushort v8, v[1:2] offset:2
-; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:14
-; CHECK-NEXT:    flat_load_ushort v10, v[1:2] offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v3, 16, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v1, v8, 16, v4
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v9, 16, v5
+; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v10, 16, v6
 ; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -4505,43 +2404,20 @@ define void @memmove_p3_p0_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p3_p0_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ushort v3, v[1:2] offset:14
-; CHECK-NEXT:    flat_load_ushort v4, v[1:2] offset:12
-; CHECK-NEXT:    flat_load_ushort v5, v[1:2] offset:10
-; CHECK-NEXT:    flat_load_ushort v6, v[1:2] offset:8
-; CHECK-NEXT:    flat_load_ushort v7, v[1:2] offset:28
-; CHECK-NEXT:    flat_load_ushort v8, v[1:2] offset:26
-; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:24
-; CHECK-NEXT:    flat_load_ushort v10, v[1:2] offset:22
-; CHECK-NEXT:    flat_load_ushort v11, v[1:2] offset:20
-; CHECK-NEXT:    flat_load_ushort v12, v[1:2] offset:18
-; CHECK-NEXT:    flat_load_ushort v13, v[1:2] offset:16
-; CHECK-NEXT:    flat_load_ushort v14, v[1:2] offset:6
-; CHECK-NEXT:    flat_load_ushort v15, v[1:2] offset:4
-; CHECK-NEXT:    flat_load_ushort v16, v[1:2] offset:2
-; CHECK-NEXT:    flat_load_ushort v17, v[1:2]
-; CHECK-NEXT:    flat_load_ubyte v18, v[1:2] offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v3, 16, v4
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v5, 16, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v8, 16, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v10, 16, v11
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
-; CHECK-NEXT:    v_lshl_or_b32 v1, v12, 16, v13
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:30
+; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:28
+; CHECK-NEXT:    flat_load_dwordx3 v[5:7], v[1:2] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
 ; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v14, 16, v15
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v16, 16, v17
-; CHECK-NEXT:    ds_write_b16 v0, v7 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(1)
-; CHECK-NEXT:    ds_write_b8 v0, v18 offset:30
-; CHECK-NEXT:    ds_write_b32 v0, v8 offset:24
-; CHECK-NEXT:    ds_write_b64 v0, v[1:2] offset:16
-; CHECK-NEXT:    ds_write2_b64 v0, v[5:6], v[3:4] offset1:1
+; CHECK-NEXT:    ds_write_b8 v0, v8 offset:30
+; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(3)
+; CHECK-NEXT:    ds_write_b16 v0, v9 offset:28
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(3)
+; CHECK-NEXT:    ds_write_b32 v0, v7 offset:24
+; CHECK-NEXT:    ds_write_b64 v0, v[5:6] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(4)
+; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -4553,40 +2429,13 @@ define void @memmove_p3_p0_sz32_align_2_2(ptr addrspace(3) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p3_p0_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ushort v3, v[1:2] offset:30
-; CHECK-NEXT:    flat_load_ushort v4, v[1:2] offset:28
-; CHECK-NEXT:    flat_load_ushort v5, v[1:2] offset:26
-; CHECK-NEXT:    flat_load_ushort v6, v[1:2] offset:14
-; CHECK-NEXT:    flat_load_ushort v7, v[1:2] offset:12
-; CHECK-NEXT:    flat_load_ushort v8, v[1:2] offset:10
-; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:8
-; CHECK-NEXT:    flat_load_ushort v10, v[1:2] offset:24
-; CHECK-NEXT:    flat_load_ushort v11, v[1:2] offset:22
-; CHECK-NEXT:    flat_load_ushort v12, v[1:2] offset:20
-; CHECK-NEXT:    flat_load_ushort v13, v[1:2] offset:18
-; CHECK-NEXT:    flat_load_ushort v14, v[1:2] offset:16
-; CHECK-NEXT:    flat_load_ushort v15, v[1:2] offset:6
-; CHECK-NEXT:    flat_load_ushort v16, v[1:2] offset:4
-; CHECK-NEXT:    flat_load_ushort v17, v[1:2] offset:2
-; CHECK-NEXT:    flat_load_ushort v18, v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v3, 16, v4
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(11)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v6, 16, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v1, v5, 16, v10
-; CHECK-NEXT:    v_lshl_or_b32 v5, v8, 16, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v11, 16, v12
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v13, 16, v14
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v15, 16, v16
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v17, 16, v18
-; CHECK-NEXT:    ds_write2_b64 v0, v[3:4], v[1:2] offset0:2 offset1:3
-; CHECK-NEXT:    ds_write2_b64 v0, v[7:8], v[5:6] offset1:1
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[3:6], v[1:2] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[7:10], v[1:2]
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT:    ds_write2_b64 v0, v[3:4], v[5:6] offset0:2 offset1:3
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(1)
+; CHECK-NEXT:    ds_write2_b64 v0, v[7:8], v[9:10] offset1:1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -5398,44 +3247,13 @@ define void @memmove_p3_p5_sz16_align_1_1(ptr addrspace(3) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p3_p5_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v3, 8, v2
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v5, 8, v4
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v7, 8, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v9, 8, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v11, 8, v10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v13, 8, v12
-; CHECK-NEXT:    v_lshl_or_b32 v2, v7, 16, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v15, 8, v14
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v1, 8, v16
-; CHECK-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
-; CHECK-NEXT:    v_lshl_or_b32 v4, v8, 16, v5
-; CHECK-NEXT:    v_lshl_or_b32 v3, v10, 16, v9
-; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; CHECK-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -5447,83 +3265,26 @@ define void @memmove_p3_p5_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p3_p5_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1e
-; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v7, v1, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(29)
-; CHECK-NEXT:    v_lshl_or_b32 v1, v3, 8, v2
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v5, 8, v4
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    v_lshlrev_b16 v3, 8, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v8, 8, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v10, 8, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v12, 8, v11
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v14, 8, v13
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v16, 8, v15
-; CHECK-NEXT:    v_lshl_or_b32 v16, v2, 16, v1
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v18, 8, v17
-; CHECK-NEXT:    v_lshl_or_b32 v2, v5, 16, v4
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v20, 8, v19
-; CHECK-NEXT:    v_lshl_or_b32 v1, v7, 16, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v22, 8, v21
+; CHECK-NEXT:    s_clause 0x8
+; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_ushort v9, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    s_waitcnt vmcnt(8)
+; CHECK-NEXT:    ds_write_b32 v0, v8 offset:24
 ; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v25, 8, v24
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v23, 8, v27
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v28, 8, v26
-; CHECK-NEXT:    v_lshl_or_b32 v4, v9, 16, v8
+; CHECK-NEXT:    ds_write_b16 v0, v9 offset:28
+; CHECK-NEXT:    s_waitcnt vmcnt(6)
+; CHECK-NEXT:    ds_write_b8 v0, v10 offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v15, v30, 8, v29
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    v_or_b32_e32 v17, v3, v31
-; CHECK-NEXT:    v_lshl_or_b32 v3, v11, 16, v10
-; CHECK-NEXT:    v_lshl_or_b32 v6, v13, 16, v12
-; CHECK-NEXT:    v_lshl_or_b32 v5, v14, 16, v15
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    ds_write_b8 v0, v32 offset:30
-; CHECK-NEXT:    ds_write_b32 v0, v16 offset:24
-; CHECK-NEXT:    ds_write_b16 v0, v17 offset:28
-; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
-; CHECK-NEXT:    ds_write_b64 v0, v[5:6] offset:16
+; CHECK-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    ds_write_b64 v0, v[6:7] offset:16
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -5535,81 +3296,19 @@ define void @memmove_p3_p5_sz32_align_1_1(ptr addrspace(3) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p3_p5_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1f
-; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:31
-; CHECK-NEXT:    buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v3, 8, v2
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v5, 8, v4
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v7, 8, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v9, 8, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v11, 8, v10
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v13, 8, v12
-; CHECK-NEXT:    v_lshl_or_b32 v2, v7, 16, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v15, 8, v14
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v17, 8, v16
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v20, 8, v19
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v22, 8, v21
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v18, 8, v24
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v25, 8, v23
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v15, v27, 8, v26
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v9, v1, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v16, v29, 8, v28
-; CHECK-NEXT:    v_lshl_or_b32 v6, v12, 16, v11
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v17, v31, 8, v30
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v18, v1, 8, v32
-; CHECK-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
-; CHECK-NEXT:    v_lshl_or_b32 v4, v8, 16, v5
-; CHECK-NEXT:    v_lshl_or_b32 v3, v10, 16, v9
-; CHECK-NEXT:    v_lshl_or_b32 v5, v13, 16, v14
-; CHECK-NEXT:    v_lshl_or_b32 v8, v16, 16, v15
-; CHECK-NEXT:    v_lshl_or_b32 v7, v18, 16, v17
-; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
-; CHECK-NEXT:    ds_write2_b64 v0, v[5:6], v[7:8] offset0:2 offset1:3
+; CHECK-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    ds_write2_b64 v0, v[6:7], v[8:9] offset0:2 offset1:3
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -5621,24 +3320,13 @@ define void @memmove_p3_p5_sz16_align_2_2(ptr addrspace(3) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p3_p5_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    buffer_load_ushort v3, v1, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ushort v4, v1, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ushort v2, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ushort v5, v1, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ushort v6, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v7, v1, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ushort v8, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ushort v9, v1, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v5, 16, v2
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v7, 16, v6
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v9, 16, v8
-; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; CHECK-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -5650,44 +3338,26 @@ define void @memmove_p3_p5_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p3_p5_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    buffer_load_ushort v2, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ushort v3, v1, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ushort v4, v1, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ushort v5, v1, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ushort v6, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ushort v7, v1, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ushort v8, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ushort v9, v1, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ushort v10, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v11, v1, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ushort v12, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ushort v13, v1, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ushort v14, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ushort v15, v1, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ushort v16, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v18, v3, 16, v2
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v1, v5, 16, v4
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v7, 16, v6
+; CHECK-NEXT:    s_clause 0x8
+; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_ushort v9, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:20
 ; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v9, 16, v8
+; CHECK-NEXT:    ds_write_b32 v0, v8 offset:24
+; CHECK-NEXT:    s_waitcnt vmcnt(7)
+; CHECK-NEXT:    ds_write_b16 v0, v9 offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v11, 16, v10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v13, 16, v12
+; CHECK-NEXT:    ds_write_b8 v0, v10 offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v15, 16, v14
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    ds_write_b16 v0, v16 offset:28
+; CHECK-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    ds_write_b8 v0, v17 offset:30
-; CHECK-NEXT:    ds_write_b32 v0, v18 offset:24
-; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
-; CHECK-NEXT:    ds_write_b64 v0, v[5:6] offset:16
+; CHECK-NEXT:    ds_write_b64 v0, v[6:7] offset:16
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -5699,41 +3369,19 @@ define void @memmove_p3_p5_sz32_align_2_2(ptr addrspace(3) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p3_p5_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    buffer_load_ushort v3, v1, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ushort v4, v1, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ushort v2, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ushort v5, v1, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ushort v6, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ushort v7, v1, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ushort v8, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v9, v1, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ushort v10, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ushort v11, v1, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ushort v12, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ushort v13, v1, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ushort v14, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ushort v15, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ushort v16, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ushort v17, v1, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v5, 16, v2
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v7, 16, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v9, 16, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v11, 16, v10
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v9, v1, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v13, 16, v12
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v15, 16, v14
+; CHECK-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v17, 16, v16
-; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
-; CHECK-NEXT:    ds_write2_b64 v0, v[5:6], v[7:8] offset0:2 offset1:3
+; CHECK-NEXT:    ds_write2_b64 v0, v[6:7], v[8:9] offset0:2 offset1:3
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -5890,55 +3538,12 @@ define void @memmove_p5_p0_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p0_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ubyte v3, v[1:2] offset:15
-; CHECK-NEXT:    flat_load_ubyte v4, v[1:2] offset:14
-; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:13
-; CHECK-NEXT:    flat_load_ubyte v6, v[1:2] offset:12
-; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:11
-; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:10
-; CHECK-NEXT:    flat_load_ubyte v9, v[1:2] offset:9
-; CHECK-NEXT:    flat_load_ubyte v10, v[1:2] offset:8
-; CHECK-NEXT:    flat_load_ubyte v11, v[1:2] offset:7
-; CHECK-NEXT:    flat_load_ubyte v12, v[1:2] offset:6
-; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:5
-; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:4
-; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:3
-; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:2
-; CHECK-NEXT:    flat_load_ubyte v17, v[1:2] offset:1
-; CHECK-NEXT:    flat_load_ubyte v1, v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 16, i1 false)
@@ -5949,100 +3554,24 @@ define void @memmove_p5_p0_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p0_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1e
-; CHECK-NEXT:    flat_load_ubyte v3, v[1:2] offset:30
-; CHECK-NEXT:    flat_load_ubyte v4, v[1:2] offset:29
-; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:28
-; CHECK-NEXT:    flat_load_ubyte v6, v[1:2] offset:27
-; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:26
-; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:25
-; CHECK-NEXT:    flat_load_ubyte v9, v[1:2] offset:24
-; CHECK-NEXT:    flat_load_ubyte v10, v[1:2] offset:23
-; CHECK-NEXT:    flat_load_ubyte v11, v[1:2] offset:22
-; CHECK-NEXT:    flat_load_ubyte v12, v[1:2] offset:21
-; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:20
-; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:19
-; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:18
-; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:17
-; CHECK-NEXT:    flat_load_ubyte v17, v[1:2] offset:16
-; CHECK-NEXT:    flat_load_ubyte v18, v[1:2] offset:15
-; CHECK-NEXT:    flat_load_ubyte v19, v[1:2] offset:14
-; CHECK-NEXT:    flat_load_ubyte v20, v[1:2] offset:13
-; CHECK-NEXT:    flat_load_ubyte v21, v[1:2] offset:12
-; CHECK-NEXT:    flat_load_ubyte v22, v[1:2] offset:11
-; CHECK-NEXT:    flat_load_ubyte v23, v[1:2] offset:10
-; CHECK-NEXT:    flat_load_ubyte v24, v[1:2] offset:9
-; CHECK-NEXT:    flat_load_ubyte v25, v[1:2] offset:8
-; CHECK-NEXT:    flat_load_ubyte v26, v[1:2] offset:7
-; CHECK-NEXT:    flat_load_ubyte v27, v[1:2] offset:6
-; CHECK-NEXT:    flat_load_ubyte v28, v[1:2] offset:5
-; CHECK-NEXT:    flat_load_ubyte v29, v[1:2] offset:4
-; CHECK-NEXT:    flat_load_ubyte v30, v[1:2] offset:3
-; CHECK-NEXT:    flat_load_ubyte v31, v[1:2] offset:2
-; CHECK-NEXT:    flat_load_ubyte v32, v[1:2] offset:1
-; CHECK-NEXT:    flat_load_ubyte v1, v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(30) lgkmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(29) lgkmcnt(29)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(28) lgkmcnt(28)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(27) lgkmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(26) lgkmcnt(26)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(25) lgkmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(24) lgkmcnt(24)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(23) lgkmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(22) lgkmcnt(22)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(21) lgkmcnt(21)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(20) lgkmcnt(20)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(19) lgkmcnt(19)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(18) lgkmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(17) lgkmcnt(17)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(16) lgkmcnt(16)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v21, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v22, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v23, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v24, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v25, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v26, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v27, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v28, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v29, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:30
+; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:28
+; CHECK-NEXT:    flat_load_dwordx3 v[5:7], v[1:2] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
 ; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v30, v0, s[0:3], 0 offen offset:3
+; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v31, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v32, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -6053,103 +3582,19 @@ define void @memmove_p5_p0_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p0_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1f
-; CHECK-NEXT:    flat_load_ubyte v3, v[1:2] offset:31
-; CHECK-NEXT:    flat_load_ubyte v4, v[1:2] offset:30
-; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:29
-; CHECK-NEXT:    flat_load_ubyte v6, v[1:2] offset:28
-; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:27
-; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:26
-; CHECK-NEXT:    flat_load_ubyte v9, v[1:2] offset:25
-; CHECK-NEXT:    flat_load_ubyte v10, v[1:2] offset:24
-; CHECK-NEXT:    flat_load_ubyte v11, v[1:2] offset:23
-; CHECK-NEXT:    flat_load_ubyte v12, v[1:2] offset:22
-; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:21
-; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:20
-; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:19
-; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:18
-; CHECK-NEXT:    flat_load_ubyte v17, v[1:2] offset:17
-; CHECK-NEXT:    flat_load_ubyte v18, v[1:2] offset:16
-; CHECK-NEXT:    flat_load_ubyte v19, v[1:2] offset:15
-; CHECK-NEXT:    flat_load_ubyte v20, v[1:2] offset:14
-; CHECK-NEXT:    flat_load_ubyte v21, v[1:2] offset:13
-; CHECK-NEXT:    flat_load_ubyte v22, v[1:2] offset:12
-; CHECK-NEXT:    flat_load_ubyte v23, v[1:2] offset:11
-; CHECK-NEXT:    flat_load_ubyte v24, v[1:2] offset:10
-; CHECK-NEXT:    flat_load_ubyte v25, v[1:2] offset:9
-; CHECK-NEXT:    flat_load_ubyte v26, v[1:2] offset:8
-; CHECK-NEXT:    flat_load_ubyte v27, v[1:2] offset:7
-; CHECK-NEXT:    flat_load_ubyte v28, v[1:2] offset:6
-; CHECK-NEXT:    flat_load_ubyte v29, v[1:2] offset:5
-; CHECK-NEXT:    flat_load_ubyte v30, v[1:2] offset:4
-; CHECK-NEXT:    flat_load_ubyte v31, v[1:2] offset:3
-; CHECK-NEXT:    flat_load_ubyte v32, v[1:2] offset:2
-; CHECK-NEXT:    flat_load_ubyte v33, v[1:2] offset:1
-; CHECK-NEXT:    flat_load_ubyte v1, v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(31) lgkmcnt(31)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:31
-; CHECK-NEXT:    s_waitcnt vmcnt(30) lgkmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(29) lgkmcnt(29)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(28) lgkmcnt(28)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(27) lgkmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(26) lgkmcnt(26)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(25) lgkmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(24) lgkmcnt(24)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(23) lgkmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(22) lgkmcnt(22)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(21) lgkmcnt(21)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(20) lgkmcnt(20)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(19) lgkmcnt(19)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(18) lgkmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(17) lgkmcnt(17)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(16) lgkmcnt(16)
-; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v21, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v22, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v23, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v24, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v25, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v26, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v27, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v28, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v29, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v30, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v31, v0, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v32, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[3:6], v[1:2] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[7:10], v[1:2]
 ; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v33, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 32, i1 false)
@@ -6160,31 +3605,12 @@ define void @memmove_p5_p0_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p0_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    flat_load_ushort v3, v[1:2] offset:14
-; CHECK-NEXT:    flat_load_ushort v4, v[1:2] offset:12
-; CHECK-NEXT:    flat_load_ushort v5, v[1:2] offset:10
-; CHECK-NEXT:    flat_load_ushort v6, v[1:2] offset:8
-; CHECK-NEXT:    flat_load_ushort v7, v[1:2] offset:6
-; CHECK-NEXT:    flat_load_ushort v8, v[1:2] offset:4
-; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:2
-; CHECK-NEXT:    flat_load_ushort v1, v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
-; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 16, i1 false)
@@ -6195,55 +3621,24 @@ define void @memmove_p5_p0_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p0_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ubyte v3, v[1:2] offset:30
-; CHECK-NEXT:    flat_load_ushort v4, v[1:2] offset:28
-; CHECK-NEXT:    flat_load_ushort v5, v[1:2] offset:26
-; CHECK-NEXT:    flat_load_ushort v6, v[1:2] offset:24
-; CHECK-NEXT:    flat_load_ushort v7, v[1:2] offset:22
-; CHECK-NEXT:    flat_load_ushort v8, v[1:2] offset:20
-; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:18
-; CHECK-NEXT:    flat_load_ushort v10, v[1:2] offset:16
-; CHECK-NEXT:    flat_load_ushort v11, v[1:2] offset:14
-; CHECK-NEXT:    flat_load_ushort v12, v[1:2] offset:12
-; CHECK-NEXT:    flat_load_ushort v13, v[1:2] offset:10
-; CHECK-NEXT:    flat_load_ushort v14, v[1:2] offset:8
-; CHECK-NEXT:    flat_load_ushort v15, v[1:2] offset:6
-; CHECK-NEXT:    flat_load_ushort v16, v[1:2] offset:4
-; CHECK-NEXT:    flat_load_ushort v17, v[1:2] offset:2
-; CHECK-NEXT:    flat_load_ushort v1, v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(13)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(11)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
-; CHECK-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
-; CHECK-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:30
+; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:28
+; CHECK-NEXT:    flat_load_dwordx3 v[5:7], v[1:2] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
 ; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_short v17, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -6254,55 +3649,19 @@ define void @memmove_p5_p0_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p0_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ushort v3, v[1:2] offset:30
-; CHECK-NEXT:    flat_load_ushort v4, v[1:2] offset:28
-; CHECK-NEXT:    flat_load_ushort v5, v[1:2] offset:26
-; CHECK-NEXT:    flat_load_ushort v6, v[1:2] offset:24
-; CHECK-NEXT:    flat_load_ushort v7, v[1:2] offset:22
-; CHECK-NEXT:    flat_load_ushort v8, v[1:2] offset:20
-; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:18
-; CHECK-NEXT:    flat_load_ushort v10, v[1:2] offset:16
-; CHECK-NEXT:    flat_load_ushort v11, v[1:2] offset:14
-; CHECK-NEXT:    flat_load_ushort v12, v[1:2] offset:12
-; CHECK-NEXT:    flat_load_ushort v13, v[1:2] offset:10
-; CHECK-NEXT:    flat_load_ushort v14, v[1:2] offset:8
-; CHECK-NEXT:    flat_load_ushort v15, v[1:2] offset:6
-; CHECK-NEXT:    flat_load_ushort v16, v[1:2] offset:4
-; CHECK-NEXT:    flat_load_ushort v17, v[1:2] offset:2
-; CHECK-NEXT:    flat_load_ushort v1, v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(13)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(11)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
-; CHECK-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
-; CHECK-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[3:6], v[1:2] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[7:10], v[1:2]
 ; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_short v17, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 32, i1 false)
@@ -6452,55 +3811,12 @@ define void @memmove_p5_p1_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p1_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ubyte v3, v[1:2], off offset:15
-; CHECK-NEXT:    global_load_ubyte v4, v[1:2], off offset:14
-; CHECK-NEXT:    global_load_ubyte v5, v[1:2], off offset:13
-; CHECK-NEXT:    global_load_ubyte v6, v[1:2], off offset:12
-; CHECK-NEXT:    global_load_ubyte v7, v[1:2], off offset:11
-; CHECK-NEXT:    global_load_ubyte v8, v[1:2], off offset:10
-; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:9
-; CHECK-NEXT:    global_load_ubyte v10, v[1:2], off offset:8
-; CHECK-NEXT:    global_load_ubyte v11, v[1:2], off offset:7
-; CHECK-NEXT:    global_load_ubyte v12, v[1:2], off offset:6
-; CHECK-NEXT:    global_load_ubyte v13, v[1:2], off offset:5
-; CHECK-NEXT:    global_load_ubyte v14, v[1:2], off offset:4
-; CHECK-NEXT:    global_load_ubyte v15, v[1:2], off offset:3
-; CHECK-NEXT:    global_load_ubyte v16, v[1:2], off offset:2
-; CHECK-NEXT:    global_load_ubyte v17, v[1:2], off offset:1
-; CHECK-NEXT:    global_load_ubyte v1, v[1:2], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 16, i1 false)
@@ -6511,100 +3827,24 @@ define void @memmove_p5_p1_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p1_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1e
-; CHECK-NEXT:    global_load_ubyte v3, v[1:2], off offset:30
-; CHECK-NEXT:    global_load_ubyte v4, v[1:2], off offset:29
-; CHECK-NEXT:    global_load_ubyte v5, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ubyte v6, v[1:2], off offset:27
-; CHECK-NEXT:    global_load_ubyte v7, v[1:2], off offset:26
-; CHECK-NEXT:    global_load_ubyte v8, v[1:2], off offset:25
-; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:24
-; CHECK-NEXT:    global_load_ubyte v10, v[1:2], off offset:23
-; CHECK-NEXT:    global_load_ubyte v11, v[1:2], off offset:22
-; CHECK-NEXT:    global_load_ubyte v12, v[1:2], off offset:21
-; CHECK-NEXT:    global_load_ubyte v13, v[1:2], off offset:20
-; CHECK-NEXT:    global_load_ubyte v14, v[1:2], off offset:19
-; CHECK-NEXT:    global_load_ubyte v15, v[1:2], off offset:18
-; CHECK-NEXT:    global_load_ubyte v16, v[1:2], off offset:17
-; CHECK-NEXT:    global_load_ubyte v17, v[1:2], off offset:16
-; CHECK-NEXT:    global_load_ubyte v18, v[1:2], off offset:15
-; CHECK-NEXT:    global_load_ubyte v19, v[1:2], off offset:14
-; CHECK-NEXT:    global_load_ubyte v20, v[1:2], off offset:13
-; CHECK-NEXT:    global_load_ubyte v21, v[1:2], off offset:12
-; CHECK-NEXT:    global_load_ubyte v22, v[1:2], off offset:11
-; CHECK-NEXT:    global_load_ubyte v23, v[1:2], off offset:10
-; CHECK-NEXT:    global_load_ubyte v24, v[1:2], off offset:9
-; CHECK-NEXT:    global_load_ubyte v25, v[1:2], off offset:8
-; CHECK-NEXT:    global_load_ubyte v26, v[1:2], off offset:7
-; CHECK-NEXT:    global_load_ubyte v27, v[1:2], off offset:6
-; CHECK-NEXT:    global_load_ubyte v28, v[1:2], off offset:5
-; CHECK-NEXT:    global_load_ubyte v29, v[1:2], off offset:4
-; CHECK-NEXT:    global_load_ubyte v30, v[1:2], off offset:3
-; CHECK-NEXT:    global_load_ubyte v31, v[1:2], off offset:2
-; CHECK-NEXT:    global_load_ubyte v32, v[1:2], off offset:1
-; CHECK-NEXT:    global_load_ubyte v1, v[1:2], off
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(29)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(21)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v21, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v22, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v23, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v24, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v25, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v26, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v27, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v28, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v29, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    global_load_dwordx3 v[5:7], v[1:2], off offset:16
+; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:28
+; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:30
+; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v30, v0, s[0:3], 0 offen offset:3
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v31, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v32, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -6615,103 +3855,19 @@ define void @memmove_p5_p1_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p1_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1f
-; CHECK-NEXT:    global_load_ubyte v3, v[1:2], off offset:31
-; CHECK-NEXT:    global_load_ubyte v4, v[1:2], off offset:30
-; CHECK-NEXT:    global_load_ubyte v5, v[1:2], off offset:29
-; CHECK-NEXT:    global_load_ubyte v6, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ubyte v7, v[1:2], off offset:27
-; CHECK-NEXT:    global_load_ubyte v8, v[1:2], off offset:26
-; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:25
-; CHECK-NEXT:    global_load_ubyte v10, v[1:2], off offset:24
-; CHECK-NEXT:    global_load_ubyte v11, v[1:2], off offset:23
-; CHECK-NEXT:    global_load_ubyte v12, v[1:2], off offset:22
-; CHECK-NEXT:    global_load_ubyte v13, v[1:2], off offset:21
-; CHECK-NEXT:    global_load_ubyte v14, v[1:2], off offset:20
-; CHECK-NEXT:    global_load_ubyte v15, v[1:2], off offset:19
-; CHECK-NEXT:    global_load_ubyte v16, v[1:2], off offset:18
-; CHECK-NEXT:    global_load_ubyte v17, v[1:2], off offset:17
-; CHECK-NEXT:    global_load_ubyte v18, v[1:2], off offset:16
-; CHECK-NEXT:    global_load_ubyte v19, v[1:2], off offset:15
-; CHECK-NEXT:    global_load_ubyte v20, v[1:2], off offset:14
-; CHECK-NEXT:    global_load_ubyte v21, v[1:2], off offset:13
-; CHECK-NEXT:    global_load_ubyte v22, v[1:2], off offset:12
-; CHECK-NEXT:    global_load_ubyte v23, v[1:2], off offset:11
-; CHECK-NEXT:    global_load_ubyte v24, v[1:2], off offset:10
-; CHECK-NEXT:    global_load_ubyte v25, v[1:2], off offset:9
-; CHECK-NEXT:    global_load_ubyte v26, v[1:2], off offset:8
-; CHECK-NEXT:    global_load_ubyte v27, v[1:2], off offset:7
-; CHECK-NEXT:    global_load_ubyte v28, v[1:2], off offset:6
-; CHECK-NEXT:    global_load_ubyte v29, v[1:2], off offset:5
-; CHECK-NEXT:    global_load_ubyte v30, v[1:2], off offset:4
-; CHECK-NEXT:    global_load_ubyte v31, v[1:2], off offset:3
-; CHECK-NEXT:    global_load_ubyte v32, v[1:2], off offset:2
-; CHECK-NEXT:    global_load_ubyte v33, v[1:2], off offset:1
-; CHECK-NEXT:    global_load_ubyte v1, v[1:2], off
-; CHECK-NEXT:    s_waitcnt vmcnt(31)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:31
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(29)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(21)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v21, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v22, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v23, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v24, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v25, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v26, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v27, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v28, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v29, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v30, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v31, v0, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v32, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v33, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 32, i1 false)
@@ -6722,31 +3878,12 @@ define void @memmove_p5_p1_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p1_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    global_load_ushort v3, v[1:2], off
-; CHECK-NEXT:    global_load_ushort v4, v[1:2], off offset:2
-; CHECK-NEXT:    global_load_ushort v5, v[1:2], off offset:4
-; CHECK-NEXT:    global_load_ushort v6, v[1:2], off offset:6
-; CHECK-NEXT:    global_load_ushort v7, v[1:2], off offset:8
-; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:10
-; CHECK-NEXT:    global_load_ushort v9, v[1:2], off offset:12
-; CHECK-NEXT:    global_load_ushort v1, v[1:2], off offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 16, i1 false)
@@ -6757,55 +3894,24 @@ define void @memmove_p5_p1_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p1_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ubyte v3, v[1:2], off offset:30
-; CHECK-NEXT:    global_load_ushort v4, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ushort v5, v[1:2], off offset:26
-; CHECK-NEXT:    global_load_ushort v6, v[1:2], off offset:24
-; CHECK-NEXT:    global_load_ushort v7, v[1:2], off offset:22
-; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:20
-; CHECK-NEXT:    global_load_ushort v9, v[1:2], off offset:18
-; CHECK-NEXT:    global_load_ushort v10, v[1:2], off offset:16
-; CHECK-NEXT:    global_load_ushort v11, v[1:2], off offset:14
-; CHECK-NEXT:    global_load_ushort v12, v[1:2], off offset:12
-; CHECK-NEXT:    global_load_ushort v13, v[1:2], off offset:10
-; CHECK-NEXT:    global_load_ushort v14, v[1:2], off offset:8
-; CHECK-NEXT:    global_load_ushort v15, v[1:2], off offset:6
-; CHECK-NEXT:    global_load_ushort v16, v[1:2], off offset:4
-; CHECK-NEXT:    global_load_ushort v17, v[1:2], off offset:2
-; CHECK-NEXT:    global_load_ushort v1, v[1:2], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    global_load_dwordx3 v[5:7], v[1:2], off offset:16
+; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:28
+; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:30
+; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_short v17, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -6816,55 +3922,19 @@ define void @memmove_p5_p1_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p1_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ushort v3, v[1:2], off offset:30
-; CHECK-NEXT:    global_load_ushort v4, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ushort v5, v[1:2], off offset:26
-; CHECK-NEXT:    global_load_ushort v6, v[1:2], off offset:24
-; CHECK-NEXT:    global_load_ushort v7, v[1:2], off offset:22
-; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:20
-; CHECK-NEXT:    global_load_ushort v9, v[1:2], off offset:18
-; CHECK-NEXT:    global_load_ushort v10, v[1:2], off offset:16
-; CHECK-NEXT:    global_load_ushort v11, v[1:2], off offset:14
-; CHECK-NEXT:    global_load_ushort v12, v[1:2], off offset:12
-; CHECK-NEXT:    global_load_ushort v13, v[1:2], off offset:10
-; CHECK-NEXT:    global_load_ushort v14, v[1:2], off offset:8
-; CHECK-NEXT:    global_load_ushort v15, v[1:2], off offset:6
-; CHECK-NEXT:    global_load_ushort v16, v[1:2], off offset:4
-; CHECK-NEXT:    global_load_ushort v17, v[1:2], off offset:2
-; CHECK-NEXT:    global_load_ushort v1, v[1:2], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_short v17, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 32, i1 false)
@@ -7009,54 +4079,12 @@ define void @memmove_p5_p3_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p3_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u8 v2, v1 offset:15
-; CHECK-NEXT:    ds_read_u8 v3, v1 offset:14
-; CHECK-NEXT:    ds_read_u8 v4, v1 offset:13
-; CHECK-NEXT:    ds_read_u8 v5, v1 offset:12
-; CHECK-NEXT:    ds_read_u8 v6, v1 offset:11
-; CHECK-NEXT:    ds_read_u8 v7, v1 offset:10
-; CHECK-NEXT:    ds_read_u8 v8, v1 offset:9
-; CHECK-NEXT:    ds_read_u8 v9, v1 offset:8
-; CHECK-NEXT:    ds_read_u8 v10, v1 offset:7
-; CHECK-NEXT:    ds_read_u8 v11, v1 offset:6
-; CHECK-NEXT:    ds_read_u8 v12, v1 offset:5
-; CHECK-NEXT:    ds_read_u8 v13, v1 offset:4
-; CHECK-NEXT:    ds_read_u8 v14, v1 offset:3
-; CHECK-NEXT:    ds_read_u8 v15, v1 offset:2
-; CHECK-NEXT:    ds_read_u8 v16, v1 offset:1
-; CHECK-NEXT:    ds_read_u8 v1, v1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt lgkmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt lgkmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt lgkmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt lgkmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt lgkmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    s_waitcnt lgkmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt lgkmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT:    ds_read2_b64 v[1:4], v1 offset1:1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 16, i1 false)
@@ -7067,72 +4095,25 @@ define void @memmove_p5_p3_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p3_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u8 v2, v1 offset:24
-; CHECK-NEXT:    ds_read_u8 v3, v1 offset:25
-; CHECK-NEXT:    ds_read_u8 v4, v1 offset:26
-; CHECK-NEXT:    ds_read_u8 v5, v1 offset:27
-; CHECK-NEXT:    ds_read_u8 v6, v1 offset:28
-; CHECK-NEXT:    ds_read_u8 v7, v1 offset:29
-; CHECK-NEXT:    ds_read_u8 v8, v1 offset:30
-; CHECK-NEXT:    ds_read_u8 v9, v1 offset:16
-; CHECK-NEXT:    ds_read_u8 v10, v1 offset:17
-; CHECK-NEXT:    ds_read_u8 v11, v1 offset:18
-; CHECK-NEXT:    ds_read_u8 v12, v1 offset:19
-; CHECK-NEXT:    ds_read_u8 v13, v1 offset:20
-; CHECK-NEXT:    ds_read_u8 v14, v1 offset:21
-; CHECK-NEXT:    ds_read_u8 v15, v1 offset:22
-; CHECK-NEXT:    ds_read_u8 v16, v1 offset:23
-; CHECK-NEXT:    ds_read_u8 v17, v1 offset:8
-; CHECK-NEXT:    ds_read_u8 v18, v1 offset:9
-; CHECK-NEXT:    ds_read_u8 v19, v1 offset:10
-; CHECK-NEXT:    ds_read_u8 v20, v1 offset:11
-; CHECK-NEXT:    ds_read_u8 v21, v1 offset:12
-; CHECK-NEXT:    ds_read_u8 v22, v1 offset:13
-; CHECK-NEXT:    ds_read_u8 v23, v1 offset:14
-; CHECK-NEXT:    ds_read_u8 v24, v1 offset:15
-; CHECK-NEXT:    ds_read_u8 v25, v1
-; CHECK-NEXT:    ds_read_u8 v26, v1 offset:1
-; CHECK-NEXT:    ds_read_u8 v27, v1 offset:2
-; CHECK-NEXT:    ds_read_u8 v28, v1 offset:3
-; CHECK-NEXT:    ds_read_u8 v29, v1 offset:4
-; CHECK-NEXT:    ds_read_u8 v30, v1 offset:5
-; CHECK-NEXT:    ds_read_u8 v31, v1 offset:6
-; CHECK-NEXT:    ds_read_u8 v1, v1 offset:7
-; CHECK-NEXT:    s_waitcnt lgkmcnt(24)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt lgkmcnt(16)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt lgkmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v24, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_store_byte v23, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_store_byte v22, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_store_byte v21, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_store_byte v31, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_store_byte v30, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_store_byte v29, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_store_byte v28, v0, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_store_byte v27, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_store_byte v26, v0, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_store_byte v25, v0, s[0:3], 0 offen
+; CHECK-NEXT:    ds_read_b32 v8, v1 offset:24
+; CHECK-NEXT:    ds_read_u16 v9, v1 offset:28
+; CHECK-NEXT:    ds_read_u8 v10, v1 offset:30
+; CHECK-NEXT:    ds_read2_b64 v[2:5], v1 offset1:1
+; CHECK-NEXT:    ds_read_b64 v[6:7], v1 offset:16
+; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
+; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
+; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -7143,74 +4124,18 @@ define void @memmove_p5_p3_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p3_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u8 v2, v1 offset:24
-; CHECK-NEXT:    ds_read_u8 v3, v1 offset:25
-; CHECK-NEXT:    ds_read_u8 v4, v1 offset:26
-; CHECK-NEXT:    ds_read_u8 v5, v1 offset:27
-; CHECK-NEXT:    ds_read_u8 v6, v1 offset:28
-; CHECK-NEXT:    ds_read_u8 v7, v1 offset:29
-; CHECK-NEXT:    ds_read_u8 v8, v1 offset:30
-; CHECK-NEXT:    ds_read_u8 v9, v1 offset:31
-; CHECK-NEXT:    ds_read_u8 v10, v1 offset:16
-; CHECK-NEXT:    ds_read_u8 v11, v1 offset:17
-; CHECK-NEXT:    ds_read_u8 v12, v1 offset:18
-; CHECK-NEXT:    ds_read_u8 v13, v1 offset:19
-; CHECK-NEXT:    ds_read_u8 v14, v1 offset:20
-; CHECK-NEXT:    ds_read_u8 v15, v1 offset:21
-; CHECK-NEXT:    ds_read_u8 v16, v1 offset:22
-; CHECK-NEXT:    ds_read_u8 v17, v1 offset:23
-; CHECK-NEXT:    ds_read_u8 v18, v1 offset:8
-; CHECK-NEXT:    ds_read_u8 v19, v1 offset:9
-; CHECK-NEXT:    ds_read_u8 v20, v1 offset:10
-; CHECK-NEXT:    ds_read_u8 v21, v1 offset:11
-; CHECK-NEXT:    ds_read_u8 v22, v1 offset:12
-; CHECK-NEXT:    ds_read_u8 v23, v1 offset:13
-; CHECK-NEXT:    ds_read_u8 v24, v1 offset:14
-; CHECK-NEXT:    ds_read_u8 v25, v1 offset:15
-; CHECK-NEXT:    ds_read_u8 v26, v1
-; CHECK-NEXT:    ds_read_u8 v27, v1 offset:1
-; CHECK-NEXT:    ds_read_u8 v28, v1 offset:2
-; CHECK-NEXT:    ds_read_u8 v29, v1 offset:3
-; CHECK-NEXT:    ds_read_u8 v30, v1 offset:4
-; CHECK-NEXT:    ds_read_u8 v31, v1 offset:5
-; CHECK-NEXT:    ds_read_u8 v32, v1 offset:6
-; CHECK-NEXT:    ds_read_u8 v1, v1 offset:7
-; CHECK-NEXT:    s_waitcnt lgkmcnt(24)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:31
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt lgkmcnt(16)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt lgkmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v25, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_store_byte v24, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_store_byte v23, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_store_byte v22, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_store_byte v21, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_store_byte v32, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_store_byte v31, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_store_byte v30, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_store_byte v29, v0, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_store_byte v28, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_store_byte v27, v0, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_store_byte v26, v0, s[0:3], 0 offen
+; CHECK-NEXT:    ds_read2_b64 v[2:5], v1 offset1:1
+; CHECK-NEXT:    ds_read2_b64 v[6:9], v1 offset0:2 offset1:3
+; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 32, i1 false)
@@ -7221,30 +4146,12 @@ define void @memmove_p5_p3_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p3_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u16 v2, v1
-; CHECK-NEXT:    ds_read_u16 v3, v1 offset:2
-; CHECK-NEXT:    ds_read_u16 v4, v1 offset:4
-; CHECK-NEXT:    ds_read_u16 v5, v1 offset:6
-; CHECK-NEXT:    ds_read_u16 v6, v1 offset:8
-; CHECK-NEXT:    ds_read_u16 v7, v1 offset:10
-; CHECK-NEXT:    ds_read_u16 v8, v1 offset:12
-; CHECK-NEXT:    ds_read_u16 v1, v1 offset:14
-; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
-; CHECK-NEXT:    buffer_store_short v2, v0, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt lgkmcnt(6)
-; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(5)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    ds_read2_b64 v[1:4], v1 offset1:1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 16, i1 false)
@@ -7255,54 +4162,25 @@ define void @memmove_p5_p3_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p3_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u8 v2, v1 offset:30
-; CHECK-NEXT:    ds_read_u16 v3, v1 offset:28
-; CHECK-NEXT:    ds_read_u16 v4, v1 offset:26
-; CHECK-NEXT:    ds_read_u16 v5, v1 offset:24
-; CHECK-NEXT:    ds_read_u16 v6, v1 offset:22
-; CHECK-NEXT:    ds_read_u16 v7, v1 offset:20
-; CHECK-NEXT:    ds_read_u16 v8, v1 offset:18
-; CHECK-NEXT:    ds_read_u16 v9, v1 offset:16
-; CHECK-NEXT:    ds_read_u16 v10, v1 offset:14
-; CHECK-NEXT:    ds_read_u16 v11, v1 offset:12
-; CHECK-NEXT:    ds_read_u16 v12, v1 offset:10
-; CHECK-NEXT:    ds_read_u16 v13, v1 offset:8
-; CHECK-NEXT:    ds_read_u16 v14, v1 offset:6
-; CHECK-NEXT:    ds_read_u16 v15, v1 offset:4
-; CHECK-NEXT:    ds_read_u16 v16, v1 offset:2
-; CHECK-NEXT:    ds_read_u16 v1, v1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt lgkmcnt(14)
-; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt lgkmcnt(11)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt lgkmcnt(10)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt lgkmcnt(9)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt lgkmcnt(8)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
-; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt lgkmcnt(6)
-; CHECK-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt lgkmcnt(5)
-; CHECK-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT:    ds_read_b32 v8, v1 offset:24
+; CHECK-NEXT:    ds_read_u16 v9, v1 offset:28
+; CHECK-NEXT:    ds_read_u8 v10, v1 offset:30
+; CHECK-NEXT:    ds_read2_b64 v[2:5], v1 offset1:1
+; CHECK-NEXT:    ds_read_b64 v[6:7], v1 offset:16
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -7313,54 +4191,18 @@ define void @memmove_p5_p3_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p3_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u16 v2, v1 offset:30
-; CHECK-NEXT:    ds_read_u16 v3, v1 offset:28
-; CHECK-NEXT:    ds_read_u16 v4, v1 offset:26
-; CHECK-NEXT:    ds_read_u16 v5, v1 offset:24
-; CHECK-NEXT:    ds_read_u16 v6, v1 offset:22
-; CHECK-NEXT:    ds_read_u16 v7, v1 offset:20
-; CHECK-NEXT:    ds_read_u16 v8, v1 offset:18
-; CHECK-NEXT:    ds_read_u16 v9, v1 offset:16
-; CHECK-NEXT:    ds_read_u16 v10, v1 offset:14
-; CHECK-NEXT:    ds_read_u16 v11, v1 offset:12
-; CHECK-NEXT:    ds_read_u16 v12, v1 offset:10
-; CHECK-NEXT:    ds_read_u16 v13, v1 offset:8
-; CHECK-NEXT:    ds_read_u16 v14, v1 offset:6
-; CHECK-NEXT:    ds_read_u16 v15, v1 offset:4
-; CHECK-NEXT:    ds_read_u16 v16, v1 offset:2
-; CHECK-NEXT:    ds_read_u16 v1, v1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    buffer_store_short v2, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt lgkmcnt(14)
-; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt lgkmcnt(11)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt lgkmcnt(10)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt lgkmcnt(9)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt lgkmcnt(8)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
-; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt lgkmcnt(6)
-; CHECK-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt lgkmcnt(5)
-; CHECK-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    ds_read2_b64 v[2:5], v1 offset1:1
+; CHECK-NEXT:    ds_read2_b64 v[6:9], v1 offset0:2 offset1:3
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 32, i1 false)
@@ -7505,55 +4347,12 @@ define void @memmove_p5_p4_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p4_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ubyte v3, v[1:2], off offset:15
-; CHECK-NEXT:    global_load_ubyte v4, v[1:2], off offset:14
-; CHECK-NEXT:    global_load_ubyte v5, v[1:2], off offset:13
-; CHECK-NEXT:    global_load_ubyte v6, v[1:2], off offset:12
-; CHECK-NEXT:    global_load_ubyte v7, v[1:2], off offset:11
-; CHECK-NEXT:    global_load_ubyte v8, v[1:2], off offset:10
-; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:9
-; CHECK-NEXT:    global_load_ubyte v10, v[1:2], off offset:8
-; CHECK-NEXT:    global_load_ubyte v11, v[1:2], off offset:7
-; CHECK-NEXT:    global_load_ubyte v12, v[1:2], off offset:6
-; CHECK-NEXT:    global_load_ubyte v13, v[1:2], off offset:5
-; CHECK-NEXT:    global_load_ubyte v14, v[1:2], off offset:4
-; CHECK-NEXT:    global_load_ubyte v15, v[1:2], off offset:3
-; CHECK-NEXT:    global_load_ubyte v16, v[1:2], off offset:2
-; CHECK-NEXT:    global_load_ubyte v17, v[1:2], off offset:1
-; CHECK-NEXT:    global_load_ubyte v1, v[1:2], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 16, i1 false)
@@ -7564,100 +4363,24 @@ define void @memmove_p5_p4_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p4_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1e
-; CHECK-NEXT:    global_load_ubyte v3, v[1:2], off offset:30
-; CHECK-NEXT:    global_load_ubyte v4, v[1:2], off offset:29
-; CHECK-NEXT:    global_load_ubyte v5, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ubyte v6, v[1:2], off offset:27
-; CHECK-NEXT:    global_load_ubyte v7, v[1:2], off offset:26
-; CHECK-NEXT:    global_load_ubyte v8, v[1:2], off offset:25
-; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:24
-; CHECK-NEXT:    global_load_ubyte v10, v[1:2], off offset:23
-; CHECK-NEXT:    global_load_ubyte v11, v[1:2], off offset:22
-; CHECK-NEXT:    global_load_ubyte v12, v[1:2], off offset:21
-; CHECK-NEXT:    global_load_ubyte v13, v[1:2], off offset:20
-; CHECK-NEXT:    global_load_ubyte v14, v[1:2], off offset:19
-; CHECK-NEXT:    global_load_ubyte v15, v[1:2], off offset:18
-; CHECK-NEXT:    global_load_ubyte v16, v[1:2], off offset:17
-; CHECK-NEXT:    global_load_ubyte v17, v[1:2], off offset:16
-; CHECK-NEXT:    global_load_ubyte v18, v[1:2], off offset:15
-; CHECK-NEXT:    global_load_ubyte v19, v[1:2], off offset:14
-; CHECK-NEXT:    global_load_ubyte v20, v[1:2], off offset:13
-; CHECK-NEXT:    global_load_ubyte v21, v[1:2], off offset:12
-; CHECK-NEXT:    global_load_ubyte v22, v[1:2], off offset:11
-; CHECK-NEXT:    global_load_ubyte v23, v[1:2], off offset:10
-; CHECK-NEXT:    global_load_ubyte v24, v[1:2], off offset:9
-; CHECK-NEXT:    global_load_ubyte v25, v[1:2], off offset:8
-; CHECK-NEXT:    global_load_ubyte v26, v[1:2], off offset:7
-; CHECK-NEXT:    global_load_ubyte v27, v[1:2], off offset:6
-; CHECK-NEXT:    global_load_ubyte v28, v[1:2], off offset:5
-; CHECK-NEXT:    global_load_ubyte v29, v[1:2], off offset:4
-; CHECK-NEXT:    global_load_ubyte v30, v[1:2], off offset:3
-; CHECK-NEXT:    global_load_ubyte v31, v[1:2], off offset:2
-; CHECK-NEXT:    global_load_ubyte v32, v[1:2], off offset:1
-; CHECK-NEXT:    global_load_ubyte v1, v[1:2], off
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(29)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(21)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v21, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v22, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v23, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v24, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v25, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v26, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v27, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v28, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v29, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    global_load_dwordx3 v[5:7], v[1:2], off offset:16
+; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:28
+; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:30
+; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v30, v0, s[0:3], 0 offen offset:3
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v31, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v32, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -7668,103 +4391,19 @@ define void @memmove_p5_p4_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p4_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1f
-; CHECK-NEXT:    global_load_ubyte v3, v[1:2], off offset:31
-; CHECK-NEXT:    global_load_ubyte v4, v[1:2], off offset:30
-; CHECK-NEXT:    global_load_ubyte v5, v[1:2], off offset:29
-; CHECK-NEXT:    global_load_ubyte v6, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ubyte v7, v[1:2], off offset:27
-; CHECK-NEXT:    global_load_ubyte v8, v[1:2], off offset:26
-; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:25
-; CHECK-NEXT:    global_load_ubyte v10, v[1:2], off offset:24
-; CHECK-NEXT:    global_load_ubyte v11, v[1:2], off offset:23
-; CHECK-NEXT:    global_load_ubyte v12, v[1:2], off offset:22
-; CHECK-NEXT:    global_load_ubyte v13, v[1:2], off offset:21
-; CHECK-NEXT:    global_load_ubyte v14, v[1:2], off offset:20
-; CHECK-NEXT:    global_load_ubyte v15, v[1:2], off offset:19
-; CHECK-NEXT:    global_load_ubyte v16, v[1:2], off offset:18
-; CHECK-NEXT:    global_load_ubyte v17, v[1:2], off offset:17
-; CHECK-NEXT:    global_load_ubyte v18, v[1:2], off offset:16
-; CHECK-NEXT:    global_load_ubyte v19, v[1:2], off offset:15
-; CHECK-NEXT:    global_load_ubyte v20, v[1:2], off offset:14
-; CHECK-NEXT:    global_load_ubyte v21, v[1:2], off offset:13
-; CHECK-NEXT:    global_load_ubyte v22, v[1:2], off offset:12
-; CHECK-NEXT:    global_load_ubyte v23, v[1:2], off offset:11
-; CHECK-NEXT:    global_load_ubyte v24, v[1:2], off offset:10
-; CHECK-NEXT:    global_load_ubyte v25, v[1:2], off offset:9
-; CHECK-NEXT:    global_load_ubyte v26, v[1:2], off offset:8
-; CHECK-NEXT:    global_load_ubyte v27, v[1:2], off offset:7
-; CHECK-NEXT:    global_load_ubyte v28, v[1:2], off offset:6
-; CHECK-NEXT:    global_load_ubyte v29, v[1:2], off offset:5
-; CHECK-NEXT:    global_load_ubyte v30, v[1:2], off offset:4
-; CHECK-NEXT:    global_load_ubyte v31, v[1:2], off offset:3
-; CHECK-NEXT:    global_load_ubyte v32, v[1:2], off offset:2
-; CHECK-NEXT:    global_load_ubyte v33, v[1:2], off offset:1
-; CHECK-NEXT:    global_load_ubyte v1, v[1:2], off
-; CHECK-NEXT:    s_waitcnt vmcnt(31)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:31
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(29)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(21)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v21, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v22, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v23, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v24, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v25, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v26, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v27, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v28, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v29, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v30, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v31, v0, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v32, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v33, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 32, i1 false)
@@ -7775,31 +4414,12 @@ define void @memmove_p5_p4_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p4_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    global_load_ushort v3, v[1:2], off
-; CHECK-NEXT:    global_load_ushort v4, v[1:2], off offset:2
-; CHECK-NEXT:    global_load_ushort v5, v[1:2], off offset:4
-; CHECK-NEXT:    global_load_ushort v6, v[1:2], off offset:6
-; CHECK-NEXT:    global_load_ushort v7, v[1:2], off offset:8
-; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:10
-; CHECK-NEXT:    global_load_ushort v9, v[1:2], off offset:12
-; CHECK-NEXT:    global_load_ushort v1, v[1:2], off offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 16, i1 false)
@@ -7810,55 +4430,24 @@ define void @memmove_p5_p4_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p4_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ubyte v3, v[1:2], off offset:30
-; CHECK-NEXT:    global_load_ushort v4, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ushort v5, v[1:2], off offset:26
-; CHECK-NEXT:    global_load_ushort v6, v[1:2], off offset:24
-; CHECK-NEXT:    global_load_ushort v7, v[1:2], off offset:22
-; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:20
-; CHECK-NEXT:    global_load_ushort v9, v[1:2], off offset:18
-; CHECK-NEXT:    global_load_ushort v10, v[1:2], off offset:16
-; CHECK-NEXT:    global_load_ushort v11, v[1:2], off offset:14
-; CHECK-NEXT:    global_load_ushort v12, v[1:2], off offset:12
-; CHECK-NEXT:    global_load_ushort v13, v[1:2], off offset:10
-; CHECK-NEXT:    global_load_ushort v14, v[1:2], off offset:8
-; CHECK-NEXT:    global_load_ushort v15, v[1:2], off offset:6
-; CHECK-NEXT:    global_load_ushort v16, v[1:2], off offset:4
-; CHECK-NEXT:    global_load_ushort v17, v[1:2], off offset:2
-; CHECK-NEXT:    global_load_ushort v1, v[1:2], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    global_load_dwordx3 v[5:7], v[1:2], off offset:16
+; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:28
+; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:30
+; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_short v17, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -7869,55 +4458,19 @@ define void @memmove_p5_p4_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p4_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ushort v3, v[1:2], off offset:30
-; CHECK-NEXT:    global_load_ushort v4, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ushort v5, v[1:2], off offset:26
-; CHECK-NEXT:    global_load_ushort v6, v[1:2], off offset:24
-; CHECK-NEXT:    global_load_ushort v7, v[1:2], off offset:22
-; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:20
-; CHECK-NEXT:    global_load_ushort v9, v[1:2], off offset:18
-; CHECK-NEXT:    global_load_ushort v10, v[1:2], off offset:16
-; CHECK-NEXT:    global_load_ushort v11, v[1:2], off offset:14
-; CHECK-NEXT:    global_load_ushort v12, v[1:2], off offset:12
-; CHECK-NEXT:    global_load_ushort v13, v[1:2], off offset:10
-; CHECK-NEXT:    global_load_ushort v14, v[1:2], off offset:8
-; CHECK-NEXT:    global_load_ushort v15, v[1:2], off offset:6
-; CHECK-NEXT:    global_load_ushort v16, v[1:2], off offset:4
-; CHECK-NEXT:    global_load_ushort v17, v[1:2], off offset:2
-; CHECK-NEXT:    global_load_ushort v1, v[1:2], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_short v17, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 32, i1 false)
@@ -8062,55 +4615,19 @@ define void @memmove_p5_p5_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p5_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:3
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 16, i1 false)
@@ -8121,100 +4638,34 @@ define void @memmove_p5_p5_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p5_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1e
-; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(29)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(21)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v21, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v22, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v23, v0, s[0:3], 0 offen offset:9
+; CHECK-NEXT:    s_clause 0x8
+; CHECK-NEXT:    buffer_load_ushort v2, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v9, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v24, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_short v2, v0, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v25, v0, s[0:3], 0 offen offset:7
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v26, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v27, v0, s[0:3], 0 offen offset:5
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:20
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v28, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v29, v0, s[0:3], 0 offen offset:3
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v30, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v31, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -8225,103 +4676,31 @@ define void @memmove_p5_p5_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p5_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1f
-; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:31
-; CHECK-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(31)
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:31
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(29)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(21)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v21, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v22, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v23, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v24, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v25, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v26, v0, s[0:3], 0 offen offset:7
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v27, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v28, v0, s[0:3], 0 offen offset:5
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v29, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:20
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v30, v0, s[0:3], 0 offen offset:3
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v31, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v32, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 32, i1 false)
@@ -8332,31 +4711,19 @@ define void @memmove_p5_p5_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p5_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    buffer_load_ushort v2, v1, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ushort v3, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v4, v1, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ushort v5, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ushort v6, v1, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ushort v7, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ushort v8, v1, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ushort v1, v1, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_short v2, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 16, i1 false)
@@ -8367,55 +4734,34 @@ define void @memmove_p5_p5_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p5_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ushort v3, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ushort v4, v1, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ushort v5, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ushort v6, v1, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ushort v7, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ushort v8, v1, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ushort v9, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ushort v10, v1, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ushort v11, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v12, v1, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ushort v13, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ushort v14, v1, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ushort v15, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ushort v16, v1, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ushort v1, v1, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT:    s_clause 0x8
+; CHECK-NEXT:    buffer_load_ushort v2, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v9, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_short v2, v0, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:20
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -8426,55 +4772,31 @@ define void @memmove_p5_p5_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p5_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    buffer_load_ushort v2, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ushort v3, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ushort v4, v1, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ushort v5, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ushort v6, v1, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ushort v7, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ushort v8, v1, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ushort v9, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ushort v10, v1, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ushort v11, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v12, v1, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ushort v13, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ushort v14, v1, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ushort v15, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ushort v16, v1, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ushort v1, v1, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_short v2, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:20
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 32, i1 false)



More information about the llvm-commits mailing list