[llvm] [AMDGPU] Enable unaligned scratch accesses (PR #110219)

Fabian Ritter via llvm-commits llvm-commits at lists.llvm.org
Thu Oct 10 06:51:22 PDT 2024


https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/110219

From 5c8c35ec8a22868b2b900b45b04263d6f35d53ca Mon Sep 17 00:00:00 2001
From: Fabian Ritter <fabian.ritter at amd.com>
Date: Fri, 27 Sep 2024 03:25:04 -0400
Subject: [PATCH 1/3] [AMDGPU] Enable unaligned scratch accesses

This allows us to emit wide generic and scratch memory accesses when we do
not have alignment information. In cases where accesses happen to be properly
aligned or where generic accesses do not go to scratch memory, this improves
the performance of the generated code by up to 16x and reduces code size,
especially when lowering memcpy and memmove intrinsics.

Also, make the use of FeatureUnalignedScratchAccess more consistent: in some
places, the code already assumed that unaligned accesses with the specialized
flat scratch instructions are allowed independently of
FeatureUnalignedScratchAccess. This patch applies that interpretation
everywhere.

Part of SWDEV-455845.
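
For reference, a minimal standalone sketch of the consolidated legality check
that the GCNSubtarget.h and SIISelLowering.cpp hunks below implement. This is
not the actual TLI hook; SubtargetModel and allowsMisalignedPrivateOrFlat are
simplified stand-ins for GCNSubtarget and
SITargetLowering::allowsMisalignedMemoryAccessesImpl. The idea: a private or
flat access stays wide if it is at least 4-byte aligned, if the specialized
flat scratch instructions are in use, or if both UnalignedScratchAccess and
UnalignedAccessMode are enabled; it is only reported as fast when 4-byte
aligned.

  #include <cstdio>

  // Simplified model of the relevant subtarget state; the real code queries
  // GCNSubtarget, and these fields stand in for its UnalignedScratchAccess,
  // UnalignedAccessMode, and flat-scratch state.
  struct SubtargetModel {
    bool UnalignedScratchAccess;
    bool UnalignedAccessMode;
    bool FlatScratchEnabled;
  };

  // Mirrors the new GCNSubtarget::hasUnalignedScratchAccessEnabled().
  static bool hasUnalignedScratchAccessEnabled(const SubtargetModel &ST) {
    return ST.UnalignedScratchAccess && ST.UnalignedAccessMode;
  }

  // Sketch of the private/flat branch of the hook after this patch: the
  // access is kept wide (not split by legalization) when any of the three
  // conditions holds, and is considered fast only when 4-byte aligned.
  static bool allowsMisalignedPrivateOrFlat(const SubtargetModel &ST,
                                            unsigned AlignInBytes,
                                            bool *IsFast) {
    bool AlignedBy4 = AlignInBytes >= 4;
    if (IsFast)
      *IsFast = AlignedBy4;
    return AlignedBy4 || ST.FlatScratchEnabled ||
           hasUnalignedScratchAccessEnabled(ST);
  }

  int main() {
    // E.g. a configuration with the unaligned features enabled but flat
    // scratch disabled: a 1-byte-aligned access is allowed but not fast.
    SubtargetModel ST{true, true, false};
    bool Fast = false;
    bool Allowed = allowsMisalignedPrivateOrFlat(ST, /*AlignInBytes=*/1, &Fast);
    std::printf("allowed=%d fast=%d\n", Allowed, Fast); // allowed=1 fast=0
  }

Under this model, splitting into byte- or short-sized accesses (the removed
CHECK lines in the test diffs below) only remains necessary when none of
these conditions hold.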
---
 llvm/lib/Target/AMDGPU/AMDGPU.td              |   18 +-
 .../AMDGPU/AMDGPUTargetTransformInfo.cpp      |    5 +-
 llvm/lib/Target/AMDGPU/GCNSubtarget.h         |    4 +
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |   17 +-
 .../AMDGPU/GlobalISel/legalize-load-flat.mir  | 5360 ++------------
 .../GlobalISel/legalize-load-private.mir      | 2696 ++-----
 llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll    |   23 +-
 .../test/CodeGen/AMDGPU/flat-address-space.ll |   12 +-
 .../CodeGen/AMDGPU/memcpy-crash-issue63986.ll |   98 +-
 llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll    | 2438 +-----
 .../AMDGPU/memcpy-param-combinations.ll       | 6516 +++--------------
 .../AMDGPU/memmove-param-combinations.ll      | 5196 ++-----------
 llvm/test/CodeGen/AMDGPU/sdwa-commute.ll      |    4 +-
 13 files changed, 2973 insertions(+), 19414 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 25117544d6a849..69569fbe95b267 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1178,9 +1178,9 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9",
    FeatureAddNoCarryInsts, FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts,
    FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16,
    FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureSupportsXNACK,
-   FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess,
-   FeatureNegativeScratchOffsetBug, FeatureGWS, FeatureDefaultComponentZero,
-   FeatureVmemWriteVgprInOrder
+   FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess,
+   FeatureUnalignedDSAccess, FeatureNegativeScratchOffsetBug, FeatureGWS,
+   FeatureDefaultComponentZero, FeatureVmemWriteVgprInOrder
   ]
 >;
 
@@ -1199,9 +1199,9 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10",
    FeatureVOP3Literal, FeatureDPP8, FeatureExtendedImageInsts,
    FeatureNoDataDepHazard, FeaturePkFmacF16Inst,
    FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureG16,
-   FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureImageInsts,
-   FeatureGDS, FeatureGWS, FeatureDefaultComponentZero,
-   FeatureMaxHardClauseLength63,
+   FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess,
+   FeatureUnalignedDSAccess, FeatureImageInsts, FeatureGDS, FeatureGWS,
+   FeatureDefaultComponentZero, FeatureMaxHardClauseLength63,
    FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts,
    FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts,
    FeatureVmemWriteVgprInOrder
@@ -1223,9 +1223,9 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11",
    FeatureVOP3Literal, FeatureDPP8, FeatureExtendedImageInsts,
    FeatureNoDataDepHazard, FeaturePkFmacF16Inst,
    FeatureA16, FeatureFastDenormalF32, FeatureG16,
-   FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureGDS,
-   FeatureGWS, FeatureDefaultComponentZero,
-   FeatureMaxHardClauseLength32,
+   FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess,
+   FeatureUnalignedDSAccess, FeatureGDS, FeatureGWS,
+   FeatureDefaultComponentZero, FeatureMaxHardClauseLength32,
    FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts,
    FeatureVmemWriteVgprInOrder
   ]
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 3f4f42377d56ee..cbf6b7d60f0366 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -387,8 +387,9 @@ bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
   // them later if they may access private memory. We don't have enough context
   // here, and legalization can handle it.
   if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
-    return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
-      ChainSizeInBytes <= ST->getMaxPrivateElementSize();
+    return (Alignment >= 4 || ST->hasUnalignedScratchAccessEnabled() ||
+            ST->enableFlatScratch()) &&
+           ChainSizeInBytes <= ST->getMaxPrivateElementSize();
   }
   return true;
 }
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 19458126093167..1ea3beb2855d69 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -591,6 +591,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return UnalignedScratchAccess;
   }
 
+  bool hasUnalignedScratchAccessEnabled() const {
+    return UnalignedScratchAccess && UnalignedAccessMode;
+  }
+
   bool hasUnalignedAccessMode() const {
     return UnalignedAccessMode;
   }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index bbdc006b9afcf0..44ca5c5ac610b3 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1824,26 +1824,17 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
            Subtarget->hasUnalignedDSAccessEnabled();
   }
 
-  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
-    bool AlignedBy4 = Alignment >= Align(4);
-    if (IsFast)
-      *IsFast = AlignedBy4;
-
-    return AlignedBy4 ||
-           Subtarget->enableFlatScratch() ||
-           Subtarget->hasUnalignedScratchAccess();
-  }
-
   // FIXME: We have to be conservative here and assume that flat operations
   // will access scratch.  If we had access to the IR function, then we
   // could determine if any private memory was used in the function.
-  if (AddrSpace == AMDGPUAS::FLAT_ADDRESS &&
-      !Subtarget->hasUnalignedScratchAccess()) {
+  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
+      AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
     bool AlignedBy4 = Alignment >= Align(4);
     if (IsFast)
       *IsFast = AlignedBy4;
 
-    return AlignedBy4;
+    return AlignedBy4 || Subtarget->enableFlatScratch() ||
+           Subtarget->hasUnalignedScratchAccessEnabled();
   }
 
   // So long as they are correct, wide global memory operations perform better
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir
index 3b166660a84b75..9bbc29107b9df6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir
@@ -483,40 +483,22 @@ body: |
     ; GFX9PLUS: liveins: $vgpr0_vgpr1
     ; GFX9PLUS-NEXT: {{  $}}
     ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR]](s32)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1)
+    ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     ;
     ; GFX11PLUS-LABEL: name: test_load_flat_s16_align1
     ; GFX11PLUS: liveins: $vgpr0_vgpr1
     ; GFX11PLUS-NEXT: {{  $}}
     ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX11PLUS-NEXT: $vgpr0 = COPY [[OR]](s32)
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1)
+    ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     ;
     ; GFX12-LABEL: name: test_load_flat_s16_align1
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: $vgpr0 = COPY [[OR]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s16_align1
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -535,27 +517,15 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0 = COPY [[OR]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1)
+    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_s16_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1)
+    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s16) = G_LOAD %0 :: (load (s16), align 1, addrspace 0)
     %2:_(s32) = G_ANYEXT %1
@@ -664,40 +634,22 @@ body: |
     ; GFX9PLUS: liveins: $vgpr0_vgpr1
     ; GFX9PLUS-NEXT: {{  $}}
     ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
-    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR]](s32)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2)
+    ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     ;
     ; GFX11PLUS-LABEL: name: test_load_flat_s32_align2
     ; GFX11PLUS: liveins: $vgpr0_vgpr1
     ; GFX11PLUS-NEXT: {{  $}}
     ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
-    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX11PLUS-NEXT: $vgpr0 = COPY [[OR]](s32)
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2)
+    ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     ;
     ; GFX12-LABEL: name: test_load_flat_s32_align2
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: $vgpr0 = COPY [[OR]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2)
+    ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s32_align2
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -716,27 +668,15 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0 = COPY [[OR]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2)
+    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_s32_align2
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
-    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2)
+    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s32) = G_LOAD %0 :: (load (s32), align 2, addrspace 0)
     $vgpr0 = COPY %1
@@ -798,70 +738,22 @@ body: |
     ; GFX9PLUS: liveins: $vgpr0_vgpr1
     ; GFX9PLUS-NEXT: {{  $}}
     ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR2]](s32)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1)
+    ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     ;
     ; GFX11PLUS-LABEL: name: test_load_flat_s32_align1
     ; GFX11PLUS: liveins: $vgpr0_vgpr1
     ; GFX11PLUS-NEXT: {{  $}}
     ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX11PLUS-NEXT: $vgpr0 = COPY [[OR2]](s32)
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1)
+    ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     ;
     ; GFX12-LABEL: name: test_load_flat_s32_align1
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX12-NEXT: $vgpr0 = COPY [[OR2]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s32_align1
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -890,47 +782,15 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0 = COPY [[OR2]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1)
+    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_s32_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1)
+    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s32) = G_LOAD %0 :: (load (s32), align 1, addrspace 0)
     $vgpr0 = COPY %1
@@ -1247,76 +1107,22 @@ body: |
     ; GFX9PLUS: liveins: $vgpr0_vgpr1
     ; GFX9PLUS-NEXT: {{  $}}
     ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
-    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32)
-    ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
-    ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
-    ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32)
-    ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32)
-    ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]]
-    ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[OR2]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 2)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
     ;
     ; GFX11PLUS-LABEL: name: test_load_flat_s64_align2
     ; GFX11PLUS: liveins: $vgpr0_vgpr1
     ; GFX11PLUS-NEXT: {{  $}}
     ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
-    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32)
-    ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
-    ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
-    ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32)
-    ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32)
-    ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]]
-    ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[OR2]](s64)
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 2)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
     ;
     ; GFX12-LABEL: name: test_load_flat_s64_align2
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32)
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32)
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]]
-    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[OR2]](s64)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 2)
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s64_align2
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -1347,51 +1153,15 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]]
-    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[OR2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 2)
+    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_s64_align2
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
-    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
-    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; UNALIGNED_GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]]
-    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[OR2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 2)
+    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s64) = G_LOAD %0 :: (load (s64), align 2, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -1485,130 +1255,22 @@ body: |
     ; GFX9PLUS: liveins: $vgpr0_vgpr1
     ; GFX9PLUS-NEXT: {{  $}}
     ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
-    ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
-    ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
-    ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
-    ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[OR6]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 1)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
     ;
     ; GFX11PLUS-LABEL: name: test_load_flat_s64_align1
     ; GFX11PLUS: liveins: $vgpr0_vgpr1
     ; GFX11PLUS-NEXT: {{  $}}
     ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
-    ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
-    ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
-    ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
-    ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[OR6]](s64)
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 1)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
     ;
     ; GFX12-LABEL: name: test_load_flat_s64_align1
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
-    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
-    ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
-    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
-    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[OR6]](s64)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 1)
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s64_align1
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -1657,87 +1319,15 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
-    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[OR6]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 1)
+    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_s64_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; UNALIGNED_GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
-    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[OR6]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 1)
+    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s64) = G_LOAD %0 :: (load (s64), align 1, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -2075,87 +1665,24 @@ body: |
     ; GFX9PLUS: liveins: $vgpr0_vgpr1
     ; GFX9PLUS-NEXT: {{  $}}
     ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
-    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
-    ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
-    ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8)
-    ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10)
-    ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
-    ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32)
-    ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 2)
+    ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
     ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     ;
     ; GFX11PLUS-LABEL: name: test_load_flat_s96_align2
     ; GFX11PLUS: liveins: $vgpr0_vgpr1
     ; GFX11PLUS-NEXT: {{  $}}
     ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
-    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
-    ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
-    ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8)
-    ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10)
-    ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
-    ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32)
-    ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 2)
+    ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
     ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     ;
     ; GFX12-LABEL: name: test_load_flat_s96_align2
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8)
-    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10)
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32)
-    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 2)
+    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
     ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s96_align2
@@ -2191,58 +1718,16 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 2)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
     ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_s96_align2
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
-    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
-    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10)
-    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
-    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 2)
+    ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
     ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s96) = G_LOAD %0 :: (load (s96), align 2, addrspace 0)
@@ -2369,165 +1854,24 @@ body: |
     ; GFX9PLUS: liveins: $vgpr0_vgpr1
     ; GFX9PLUS-NEXT: {{  $}}
     ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; GFX9PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
-    ; GFX9PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
-    ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
-    ; GFX9PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
-    ; GFX9PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
-    ; GFX9PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
-    ; GFX9PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
-    ; GFX9PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
-    ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32)
-    ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 1)
+    ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
     ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     ;
     ; GFX11PLUS-LABEL: name: test_load_flat_s96_align1
     ; GFX11PLUS: liveins: $vgpr0_vgpr1
     ; GFX11PLUS-NEXT: {{  $}}
     ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; GFX11PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
-    ; GFX11PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
-    ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
-    ; GFX11PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
-    ; GFX11PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
-    ; GFX11PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
-    ; GFX11PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
-    ; GFX11PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
-    ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32)
-    ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 1)
+    ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
     ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     ;
     ; GFX12-LABEL: name: test_load_flat_s96_align1
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
-    ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
-    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
-    ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
-    ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
-    ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
-    ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
-    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32)
-    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 1)
+    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
     ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s96_align1
@@ -2589,110 +1933,16 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 1)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
     ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_s96_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
-    ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
-    ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
-    ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
-    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 1)
+    ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
     ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s96) = G_LOAD %0 :: (load (s96), align 1, addrspace 0)
@@ -3334,210 +2584,24 @@ body: |
     ; GFX9PLUS: liveins: $vgpr0_vgpr1
     ; GFX9PLUS-NEXT: {{  $}}
     ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; GFX9PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
-    ; GFX9PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
-    ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
-    ; GFX9PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
-    ; GFX9PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
-    ; GFX9PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
-    ; GFX9PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
-    ; GFX9PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
-    ; GFX9PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12
-    ; GFX9PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
-    ; GFX9PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
-    ; GFX9PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
-    ; GFX9PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
-    ; GFX9PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
-    ; GFX9PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
-    ; GFX9PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
-    ; GFX9PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
-    ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
-    ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 1)
+    ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>)
     ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128)
     ;
     ; GFX11PLUS-LABEL: name: test_load_flat_s128_align1
     ; GFX11PLUS: liveins: $vgpr0_vgpr1
     ; GFX11PLUS-NEXT: {{  $}}
     ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; GFX11PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
-    ; GFX11PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
-    ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
-    ; GFX11PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
-    ; GFX11PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
-    ; GFX11PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
-    ; GFX11PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
-    ; GFX11PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
-    ; GFX11PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12
-    ; GFX11PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
-    ; GFX11PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
-    ; GFX11PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
-    ; GFX11PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
-    ; GFX11PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
-    ; GFX11PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
-    ; GFX11PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
-    ; GFX11PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
-    ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
-    ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 1)
+    ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>)
     ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128)
     ;
     ; GFX12-LABEL: name: test_load_flat_s128_align1
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
-    ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
-    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
-    ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
-    ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
-    ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
-    ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
-    ; GFX12-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12
-    ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
-    ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
-    ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
-    ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
-    ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
-    ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
-    ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
-    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
-    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 1)
+    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>)
     ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s128_align1
@@ -3614,140 +2678,16 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 1)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>)
     ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_s128_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
-    ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
-    ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
-    ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
-    ; UNALIGNED_GFX12-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
-    ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
-    ; UNALIGNED_GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
-    ; UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
-    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 1)
+    ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>)
     ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s128) = G_LOAD %0 :: (load (s128), align 1, addrspace 0)
@@ -4132,133 +3072,22 @@ body: |
     ; GFX9PLUS: liveins: $vgpr0_vgpr1
     ; GFX9PLUS-NEXT: {{  $}}
     ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
-    ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
-    ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
-    ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
-    ; GFX9PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR6]](s64)
-    ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p0) :: (load (p1), align 1)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
     ;
     ; GFX11PLUS-LABEL: name: test_load_flat_p1_align1
     ; GFX11PLUS: liveins: $vgpr0_vgpr1
     ; GFX11PLUS-NEXT: {{  $}}
     ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
-    ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
-    ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
-    ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
-    ; GFX11PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR6]](s64)
-    ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1)
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p0) :: (load (p1), align 1)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
     ;
     ; GFX12-LABEL: name: test_load_flat_p1_align1
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
-    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
-    ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
-    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
-    ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR6]](s64)
-    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p0) :: (load (p1), align 1)
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_p1_align1
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -4308,89 +3137,15 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR6]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p0) :: (load (p1), align 1)
+    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_p1_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; UNALIGNED_GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
-    ; UNALIGNED_GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR6]](s64)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p0) :: (load (p1), align 1)
+    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(p1) = G_LOAD %0 :: (load (p1), align 1, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -4662,79 +3417,22 @@ body: |
     ; GFX9PLUS: liveins: $vgpr0_vgpr1
     ; GFX9PLUS-NEXT: {{  $}}
     ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
-    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32)
-    ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
-    ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
-    ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32)
-    ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32)
-    ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]]
-    ; GFX9PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR2]](s64)
-    ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 2)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4)
     ;
     ; GFX11PLUS-LABEL: name: test_load_flat_p4_align2
     ; GFX11PLUS: liveins: $vgpr0_vgpr1
     ; GFX11PLUS-NEXT: {{  $}}
     ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
-    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32)
-    ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
-    ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
-    ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32)
-    ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32)
-    ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]]
-    ; GFX11PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR2]](s64)
-    ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4)
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 2)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4)
     ;
     ; GFX12-LABEL: name: test_load_flat_p4_align2
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32)
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32)
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]]
-    ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR2]](s64)
-    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 2)
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_p4_align2
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -4766,53 +3464,15 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 2)
+    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_p4_align2
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
-    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
-    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; UNALIGNED_GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]]
-    ; UNALIGNED_GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR2]](s64)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 2)
+    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(p4) = G_LOAD %0 :: (load (p4), align 2, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -4906,133 +3566,22 @@ body: |
     ; GFX9PLUS: liveins: $vgpr0_vgpr1
     ; GFX9PLUS-NEXT: {{  $}}
     ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
-    ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
-    ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
-    ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
-    ; GFX9PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR6]](s64)
-    ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 1)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4)
     ;
     ; GFX11PLUS-LABEL: name: test_load_flat_p4_align1
     ; GFX11PLUS: liveins: $vgpr0_vgpr1
     ; GFX11PLUS-NEXT: {{  $}}
     ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
-    ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
-    ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
-    ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
-    ; GFX11PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR6]](s64)
-    ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4)
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 1)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4)
     ;
     ; GFX12-LABEL: name: test_load_flat_p4_align1
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
-    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
-    ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
-    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
-    ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR6]](s64)
-    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 1)
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_p4_align1
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -5082,89 +3631,15 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR6]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 1)
+    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_p4_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; UNALIGNED_GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
-    ; UNALIGNED_GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR6]](s64)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 1)
+    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(p4) = G_LOAD %0 :: (load (p4), align 1, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -5274,43 +3749,22 @@ body: |
     ; GFX9PLUS: liveins: $vgpr0_vgpr1
     ; GFX9PLUS-NEXT: {{  $}}
     ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
-    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32)
-    ; GFX9PLUS-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5), align 2)
+    ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](p5)
     ;
     ; GFX11PLUS-LABEL: name: test_load_flat_p5_align2
     ; GFX11PLUS: liveins: $vgpr0_vgpr1
     ; GFX11PLUS-NEXT: {{  $}}
     ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
-    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX11PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32)
-    ; GFX11PLUS-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5)
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5), align 2)
+    ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](p5)
     ;
     ; GFX12-LABEL: name: test_load_flat_p5_align2
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32)
-    ; GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5), align 2)
+    ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p5)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_p5_align2
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -5330,29 +3784,15 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5), align 2)
+    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](p5)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_p5_align2
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
-    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5), align 2)
+    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p5)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(p5) = G_LOAD %0 :: (load (p5), align 2, addrspace 0)
     $vgpr0 = COPY %1
@@ -5416,73 +3856,22 @@ body: |
     ; GFX9PLUS: liveins: $vgpr0_vgpr1
     ; GFX9PLUS-NEXT: {{  $}}
     ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX9PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32)
-    ; GFX9PLUS-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5), align 1)
+    ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](p5)
     ;
     ; GFX11PLUS-LABEL: name: test_load_flat_p5_align1
     ; GFX11PLUS: liveins: $vgpr0_vgpr1
     ; GFX11PLUS-NEXT: {{  $}}
     ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX11PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32)
-    ; GFX11PLUS-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5)
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5), align 1)
+    ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](p5)
     ;
     ; GFX12-LABEL: name: test_load_flat_p5_align1
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32)
-    ; GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5), align 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p5)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_p5_align1
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -5512,49 +3901,15 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5), align 1)
+    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](p5)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_p5_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; UNALIGNED_GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5), align 1)
+    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p5)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(p5) = G_LOAD %0 :: (load (p5), align 1, addrspace 0)
     $vgpr0 = COPY %1
@@ -5732,40 +4087,22 @@ body: |
     ; GFX9PLUS: liveins: $vgpr0_vgpr1
     ; GFX9PLUS-NEXT: {{  $}}
     ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR]](s32)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1)
+    ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     ;
     ; GFX11PLUS-LABEL: name: test_load_flat_v2s8_align1
     ; GFX11PLUS: liveins: $vgpr0_vgpr1
     ; GFX11PLUS-NEXT: {{  $}}
     ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX11PLUS-NEXT: $vgpr0 = COPY [[OR]](s32)
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1)
+    ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     ;
     ; GFX12-LABEL: name: test_load_flat_v2s8_align1
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: $vgpr0 = COPY [[OR]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2s8_align1
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -5784,27 +4121,15 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0 = COPY [[OR]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1)
+    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_v2s8_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1)
+    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<2 x s8>) = G_LOAD %0 :: (load (<2 x s8>), align 1, addrspace 0)
     %2:_(s16) = G_BITCAST %1
@@ -6158,121 +4483,106 @@ body: |
     ; GFX9PLUS: liveins: $vgpr0_vgpr1
     ; GFX9PLUS-NEXT: {{  $}}
     ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16), align 1)
+    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
     ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 2)
+    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
     ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32)
-    ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]]
-    ; GFX9PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32)
+    ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX9PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C2]](s32)
+    ; GFX9PLUS-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32)
     ; GFX9PLUS-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-    ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
-    ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
-    ; GFX9PLUS-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]]
+    ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
+    ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
+    ; GFX9PLUS-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]]
     ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
-    ; GFX9PLUS-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]]
-    ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
-    ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16)
-    ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]]
+    ; GFX9PLUS-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]]
+    ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
+    ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16)
+    ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL1]]
     ; GFX9PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
-    ; GFX9PLUS-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]]
+    ; GFX9PLUS-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]]
     ; GFX9PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32)
-    ; GFX9PLUS-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]]
-    ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16)
-    ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL3]]
-    ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16)
-    ; GFX9PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16)
-    ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32)
-    ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]]
-    ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR4]](s32)
+    ; GFX9PLUS-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]]
+    ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16)
+    ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]]
+    ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16)
+    ; GFX9PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16)
+    ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32)
+    ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]]
+    ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR3]](s32)
     ;
     ; GFX11PLUS-LABEL: name: test_load_flat_v3s8_align1
     ; GFX11PLUS: liveins: $vgpr0_vgpr1
     ; GFX11PLUS-NEXT: {{  $}}
     ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16), align 1)
+    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
     ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 2)
+    ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
     ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32)
-    ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]]
-    ; GFX11PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32)
+    ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX11PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C2]](s32)
+    ; GFX11PLUS-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32)
     ; GFX11PLUS-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-    ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
-    ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
-    ; GFX11PLUS-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]]
+    ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
+    ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
+    ; GFX11PLUS-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]]
     ; GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
-    ; GFX11PLUS-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]]
-    ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
-    ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16)
-    ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]]
+    ; GFX11PLUS-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]]
+    ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
+    ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16)
+    ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL1]]
     ; GFX11PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
-    ; GFX11PLUS-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]]
+    ; GFX11PLUS-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]]
     ; GFX11PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32)
-    ; GFX11PLUS-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]]
-    ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16)
-    ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL3]]
-    ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16)
-    ; GFX11PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16)
-    ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32)
-    ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]]
-    ; GFX11PLUS-NEXT: $vgpr0 = COPY [[OR4]](s32)
+    ; GFX11PLUS-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]]
+    ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16)
+    ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]]
+    ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16)
+    ; GFX11PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16)
+    ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32)
+    ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]]
+    ; GFX11PLUS-NEXT: $vgpr0 = COPY [[OR3]](s32)
     ;
     ; GFX12-LABEL: name: test_load_flat_v3s8_align1
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16), align 1)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
     ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 2)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
     ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]]
-    ; GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32)
-    ; GFX12-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32)
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C2]](s32)
+    ; GFX12-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32)
     ; GFX12-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-    ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
-    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
-    ; GFX12-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]]
+    ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
+    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
+    ; GFX12-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]]
     ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
-    ; GFX12-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]]
-    ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]]
+    ; GFX12-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]]
+    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL1]]
     ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
-    ; GFX12-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]]
+    ; GFX12-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]]
     ; GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32)
-    ; GFX12-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]]
-    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16)
-    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL3]]
-    ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16)
-    ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16)
-    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]]
-    ; GFX12-NEXT: $vgpr0 = COPY [[OR4]](s32)
+    ; GFX12-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]]
+    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16)
+    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]]
+    ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16)
+    ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16)
+    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]]
+    ; GFX12-NEXT: $vgpr0 = COPY [[OR3]](s32)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v3s8_align1
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -6318,81 +4628,71 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16), align 1)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
     ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 2)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
     ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C2]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32)
     ; UNALIGNED_GFX11PLUS-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-    ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
-    ; UNALIGNED_GFX11PLUS-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
+    ; UNALIGNED_GFX11PLUS-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]]
     ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL1]]
     ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]]
     ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL3]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]]
-    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0 = COPY [[OR4]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]]
+    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0 = COPY [[OR3]](s32)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_v3s8_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16), align 1)
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
     ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 2)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
     ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]]
-    ; UNALIGNED_GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32)
     ; UNALIGNED_GFX12-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-    ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
-    ; UNALIGNED_GFX12-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]]
+    ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
+    ; UNALIGNED_GFX12-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]]
     ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]]
-    ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
-    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16)
-    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]]
+    ; UNALIGNED_GFX12-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]]
+    ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
+    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16)
+    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL1]]
     ; UNALIGNED_GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]]
+    ; UNALIGNED_GFX12-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]]
     ; UNALIGNED_GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]]
-    ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16)
-    ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL3]]
-    ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16)
-    ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]]
-    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR4]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16)
+    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]]
+    ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16)
+    ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]]
+    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR3]](s32)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<3 x s8>) = G_LOAD %0 :: (load (<3 x s8>), align 1, addrspace 0)
     %2:_(s24) = G_BITCAST %1
@@ -6503,40 +4803,22 @@ body: |
     ; GFX9PLUS: liveins: $vgpr0_vgpr1
     ; GFX9PLUS-NEXT: {{  $}}
     ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
-    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR]](s32)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2)
+    ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     ;
     ; GFX11PLUS-LABEL: name: test_load_flat_v4s8_align2
     ; GFX11PLUS: liveins: $vgpr0_vgpr1
     ; GFX11PLUS-NEXT: {{  $}}
     ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
-    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX11PLUS-NEXT: $vgpr0 = COPY [[OR]](s32)
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2)
+    ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     ;
     ; GFX12-LABEL: name: test_load_flat_v4s8_align2
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: $vgpr0 = COPY [[OR]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2)
+    ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v4s8_align2
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -6555,27 +4837,15 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0 = COPY [[OR]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2)
+    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_v4s8_align2
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
-    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2)
+    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<4 x s8>) = G_LOAD %0 :: (load (<4 x s8>), align 2, addrspace 0)
     %2:_(s32) = G_BITCAST %1
@@ -6638,70 +4908,22 @@ body: |
     ; GFX9PLUS: liveins: $vgpr0_vgpr1
     ; GFX9PLUS-NEXT: {{  $}}
     ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR2]](s32)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1)
+    ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     ;
     ; GFX11PLUS-LABEL: name: test_load_flat_v4s8_align1
     ; GFX11PLUS: liveins: $vgpr0_vgpr1
     ; GFX11PLUS-NEXT: {{  $}}
     ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX11PLUS-NEXT: $vgpr0 = COPY [[OR2]](s32)
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1)
+    ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     ;
     ; GFX12-LABEL: name: test_load_flat_v4s8_align1
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX12-NEXT: $vgpr0 = COPY [[OR2]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v4s8_align1
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -6730,47 +4952,15 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0 = COPY [[OR2]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1)
+    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_v4s8_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1)
+    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<4 x s8>) = G_LOAD %0 :: (load (<4 x s8>), align 1, addrspace 0)
     %2:_(s32) = G_BITCAST %1
@@ -7185,40 +5375,22 @@ body: |
     ; GFX9PLUS: liveins: $vgpr0_vgpr1
     ; GFX9PLUS-NEXT: {{  $}}
     ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16))
-    ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
-    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
-    ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-    ; GFX9PLUS-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>), align 2)
+    ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>)
     ;
     ; GFX11PLUS-LABEL: name: test_load_flat_v2s16_align2
     ; GFX11PLUS: liveins: $vgpr0_vgpr1
     ; GFX11PLUS-NEXT: {{  $}}
     ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16))
-    ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
-    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
-    ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-    ; GFX11PLUS-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>), align 2)
+    ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>)
     ;
     ; GFX12-LABEL: name: test_load_flat_v2s16_align2
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16))
-    ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
-    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-    ; GFX12-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>), align 2)
+    ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2s16_align2
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -7237,27 +5409,15 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16))
-    ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>), align 2)
+    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_v2s16_align2
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16))
-    ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>), align 2)
+    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<2 x s16>) = G_LOAD %0 :: (load (<2 x s16>), align 2, addrspace 0)
     $vgpr0 = COPY %1
@@ -7327,70 +5487,22 @@ body: |
     ; GFX9PLUS: liveins: $vgpr0_vgpr1
     ; GFX9PLUS-NEXT: {{  $}}
     ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
-    ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
-    ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-    ; GFX9PLUS-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>), align 1)
+    ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>)
     ;
     ; GFX11PLUS-LABEL: name: test_load_flat_v2s16_align1
     ; GFX11PLUS: liveins: $vgpr0_vgpr1
     ; GFX11PLUS-NEXT: {{  $}}
     ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
-    ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
-    ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-    ; GFX11PLUS-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>), align 1)
+    ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>)
     ;
     ; GFX12-LABEL: name: test_load_flat_v2s16_align1
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
-    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-    ; GFX12-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>), align 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2s16_align1
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -7419,47 +5531,15 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>), align 1)
+    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_v2s16_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>), align 1)
+    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<2 x s16>) = G_LOAD %0 :: (load (<2 x s16>), align 1, addrspace 0)
     $vgpr0 = COPY %1
@@ -8291,36 +6371,22 @@ body: |
     ; GFX9PLUS: liveins: $vgpr0_vgpr1
     ; GFX9PLUS-NEXT: {{  $}}
     ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1)
+    ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
+    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
     ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
-    ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
-    ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
-    ; GFX9PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32)
+    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2, align 1)
+    ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
+    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64)
+    ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4, align 1)
+    ; GFX9PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
     ; GFX9PLUS-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
     ; GFX9PLUS-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
     ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
     ; GFX9PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
-    ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32)
+    ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX9PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32)
     ; GFX9PLUS-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
     ; GFX9PLUS-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
     ; GFX9PLUS-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
@@ -8334,36 +6400,22 @@ body: |
     ; GFX11PLUS: liveins: $vgpr0_vgpr1
     ; GFX11PLUS-NEXT: {{  $}}
     ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1)
+    ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
+    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
     ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
-    ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
-    ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
-    ; GFX11PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32)
+    ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2, align 1)
+    ; GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
+    ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64)
+    ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4, align 1)
+    ; GFX11PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
     ; GFX11PLUS-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
     ; GFX11PLUS-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
     ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
     ; GFX11PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
-    ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX11PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32)
+    ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX11PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32)
     ; GFX11PLUS-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
     ; GFX11PLUS-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
     ; GFX11PLUS-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
@@ -8377,36 +6429,22 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1)
+    ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
     ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2, align 1)
+    ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64)
+    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4, align 1)
+    ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
     ; GFX12-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
     ; GFX12-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
     ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
     ; GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
-    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32)
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32)
     ; GFX12-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
     ; GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
     ; GFX12-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
@@ -8463,36 +6501,22 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
     ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2, align 1)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4, align 1)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
     ; UNALIGNED_GFX11PLUS-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
     ; UNALIGNED_GFX11PLUS-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
     ; UNALIGNED_GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
     ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32)
     ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
     ; UNALIGNED_GFX11PLUS-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
     ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
@@ -8506,36 +6530,22 @@ body: |
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1)
+    ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
     ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
-    ; UNALIGNED_GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2, align 1)
+    ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4, align 1)
+    ; UNALIGNED_GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
     ; UNALIGNED_GFX12-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
     ; UNALIGNED_GFX12-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
     ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
     ; UNALIGNED_GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32)
     ; UNALIGNED_GFX12-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
     ; UNALIGNED_GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
     ; UNALIGNED_GFX12-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
@@ -8765,70 +6775,22 @@ body: |
     ; GFX9PLUS: liveins: $vgpr0_vgpr1
     ; GFX9PLUS-NEXT: {{  $}}
     ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16))
-    ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
-    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
-    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
-    ; GFX9PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
-    ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6
-    ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
-    ; GFX9PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32)
-    ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-    ; GFX9PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
-    ; GFX9PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
-    ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 2)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>)
     ;
     ; GFX11PLUS-LABEL: name: test_load_flat_v4s16_align2
     ; GFX11PLUS: liveins: $vgpr0_vgpr1
     ; GFX11PLUS-NEXT: {{  $}}
     ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16))
-    ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
-    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
-    ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
-    ; GFX11PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
-    ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6
-    ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
-    ; GFX11PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32)
-    ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-    ; GFX11PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
-    ; GFX11PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
-    ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 2)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>)
     ;
     ; GFX12-LABEL: name: test_load_flat_v4s16_align2
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16))
-    ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64)
-    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
-    ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
-    ; GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32)
-    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-    ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
-    ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
-    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 2)
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v4s16_align2
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -8857,47 +6819,15 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16))
-    ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
-    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 2)
+    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_v4s16_align2
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16))
-    ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
-    ; UNALIGNED_GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
-    ; UNALIGNED_GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
-    ; UNALIGNED_GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 2)
+    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<4 x s16>) = G_LOAD %0 :: (load (<4 x s16>), align 2, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -9005,124 +6935,22 @@ body: |
     ; GFX9PLUS: liveins: $vgpr0_vgpr1
     ; GFX9PLUS-NEXT: {{  $}}
     ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
-    ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
-    ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
-    ; GFX9PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32)
-    ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 6
-    ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX9PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32)
-    ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-    ; GFX9PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
-    ; GFX9PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
-    ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 1)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>)
     ;
     ; GFX11PLUS-LABEL: name: test_load_flat_v4s16_align1
     ; GFX11PLUS: liveins: $vgpr0_vgpr1
     ; GFX11PLUS-NEXT: {{  $}}
     ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
-    ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
-    ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
-    ; GFX11PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32)
-    ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 6
-    ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX11PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32)
-    ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-    ; GFX11PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
-    ; GFX11PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
-    ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 1)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>)
     ;
     ; GFX12-LABEL: name: test_load_flat_v4s16_align1
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32)
-    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 6
-    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32)
-    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-    ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
-    ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
-    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 1)
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v4s16_align1
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -9169,83 +6997,15 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 6
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
-    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 1)
+    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_v4s16_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
-    ; UNALIGNED_GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 6
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; UNALIGNED_GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
-    ; UNALIGNED_GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 1)
+    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<4 x s16>) = G_LOAD %0 :: (load (<4 x s16>), align 1, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -10686,133 +8446,22 @@ body: |
     ; GFX9PLUS: liveins: $vgpr0_vgpr1
     ; GFX9PLUS-NEXT: {{  $}}
     ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
-    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32)
-    ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
-    ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
-    ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32)
-    ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32)
-    ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]]
-    ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8)
-    ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10)
-    ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD2]]
-    ; GFX9PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR3]](s32)
-    ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s16) from unknown-address + 12)
-    ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s16) from unknown-address + 14)
-    ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD3]]
-    ; GFX9PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR4]](s32)
-    ; GFX9PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C3]](s32)
-    ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32)
-    ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s64) = G_OR [[SHL5]], [[ZEXT1]]
-    ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR2]](s64), [[OR5]](s64)
-    ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 2)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>)
     ;
     ; GFX11PLUS-LABEL: name: test_load_flat_v2s64_align2
     ; GFX11PLUS: liveins: $vgpr0_vgpr1
     ; GFX11PLUS-NEXT: {{  $}}
     ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
-    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32)
-    ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
-    ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
-    ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32)
-    ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32)
-    ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]]
-    ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8)
-    ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10)
-    ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD2]]
-    ; GFX11PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR3]](s32)
-    ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s16) from unknown-address + 12)
-    ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s16) from unknown-address + 14)
-    ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD3]]
-    ; GFX11PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR4]](s32)
-    ; GFX11PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C3]](s32)
-    ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32)
-    ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s64) = G_OR [[SHL5]], [[ZEXT1]]
-    ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR2]](s64), [[OR5]](s64)
-    ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 2)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>)
     ;
     ; GFX12-LABEL: name: test_load_flat_v2s64_align2
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32)
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32)
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]]
-    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8)
-    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10)
-    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR3]](s32)
-    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s16) from unknown-address + 12)
-    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s16) from unknown-address + 14)
-    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD3]]
-    ; GFX12-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR4]](s32)
-    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C3]](s32)
-    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32)
-    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s64) = G_OR [[SHL5]], [[ZEXT1]]
-    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR2]](s64), [[OR5]](s64)
-    ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 2)
+    ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2s64_align2
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -10862,89 +8511,15 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD2]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR3]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s16) from unknown-address + 12)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s16) from unknown-address + 14)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD3]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR4]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C3]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s64) = G_OR [[SHL5]], [[ZEXT1]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR2]](s64), [[OR5]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 2)
+    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_v2s64_align2
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
-    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
-    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; UNALIGNED_GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]]
-    ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10)
-    ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD2]]
-    ; UNALIGNED_GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR3]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s16) from unknown-address + 12)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s16) from unknown-address + 14)
-    ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD3]]
-    ; UNALIGNED_GFX12-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR4]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C3]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s64) = G_OR [[SHL5]], [[ZEXT1]]
-    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR2]](s64), [[OR5]](s64)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 2)
+    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<2 x s64>) = G_LOAD %0 :: (load (<2 x s64>), align 2, addrspace 0)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -11100,235 +8675,22 @@ body: |
     ; GFX9PLUS: liveins: $vgpr0_vgpr1
     ; GFX9PLUS-NEXT: {{  $}}
     ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
-    ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
-    ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
-    ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
-    ; GFX9PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; GFX9PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
-    ; GFX9PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
-    ; GFX9PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]]
-    ; GFX9PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
-    ; GFX9PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
-    ; GFX9PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]]
-    ; GFX9PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32)
-    ; GFX9PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]]
-    ; GFX9PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32)
-    ; GFX9PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
-    ; GFX9PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
-    ; GFX9PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]]
-    ; GFX9PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
-    ; GFX9PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
-    ; GFX9PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]]
-    ; GFX9PLUS-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32)
-    ; GFX9PLUS-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]]
-    ; GFX9PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32)
-    ; GFX9PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
-    ; GFX9PLUS-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32)
-    ; GFX9PLUS-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]]
-    ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64)
-    ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>)
     ;
     ; GFX11PLUS-LABEL: name: test_load_flat_v2s64_align1
     ; GFX11PLUS: liveins: $vgpr0_vgpr1
     ; GFX11PLUS-NEXT: {{  $}}
     ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
-    ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
-    ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
-    ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
-    ; GFX11PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; GFX11PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
-    ; GFX11PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
-    ; GFX11PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]]
-    ; GFX11PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
-    ; GFX11PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
-    ; GFX11PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]]
-    ; GFX11PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32)
-    ; GFX11PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]]
-    ; GFX11PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32)
-    ; GFX11PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
-    ; GFX11PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
-    ; GFX11PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]]
-    ; GFX11PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
-    ; GFX11PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
-    ; GFX11PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]]
-    ; GFX11PLUS-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32)
-    ; GFX11PLUS-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]]
-    ; GFX11PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32)
-    ; GFX11PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
-    ; GFX11PLUS-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32)
-    ; GFX11PLUS-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]]
-    ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64)
-    ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1)
+    ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>)
     ;
     ; GFX12-LABEL: name: test_load_flat_v2s64_align1
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
-    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
-    ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
-    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
-    ; GFX12-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
-    ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
-    ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]]
-    ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
-    ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
-    ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]]
-    ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]]
-    ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32)
-    ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
-    ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
-    ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]]
-    ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
-    ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
-    ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]]
-    ; GFX12-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]]
-    ; GFX12-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32)
-    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
-    ; GFX12-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32)
-    ; GFX12-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]]
-    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64)
-    ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1)
+    ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2s64_align1
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -11412,157 +8774,15 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1)
+    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_v2s64_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; UNALIGNED_GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
-    ; UNALIGNED_GFX12-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
-    ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]]
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
-    ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]]
-    ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]]
-    ; UNALIGNED_GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
-    ; UNALIGNED_GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]]
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
-    ; UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]]
-    ; UNALIGNED_GFX12-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]]
-    ; UNALIGNED_GFX12-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]]
-    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1)
+    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<2 x s64>) = G_LOAD %0 :: (load (<2 x s64>), align 1, addrspace 0)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -12078,342 +9298,42 @@ body: |
     ; GFX9PLUS: liveins: $vgpr0_vgpr1
     ; GFX9PLUS-NEXT: {{  $}}
     ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1)
+    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
     ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
-    ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
-    ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
-    ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
-    ; GFX9PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; GFX9PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
-    ; GFX9PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
-    ; GFX9PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]]
-    ; GFX9PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
-    ; GFX9PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
-    ; GFX9PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]]
-    ; GFX9PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32)
-    ; GFX9PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]]
-    ; GFX9PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32)
-    ; GFX9PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
-    ; GFX9PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
-    ; GFX9PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]]
-    ; GFX9PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
-    ; GFX9PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
-    ; GFX9PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]]
-    ; GFX9PLUS-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32)
-    ; GFX9PLUS-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]]
-    ; GFX9PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32)
-    ; GFX9PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
-    ; GFX9PLUS-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32)
-    ; GFX9PLUS-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]]
-    ; GFX9PLUS-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
-    ; GFX9PLUS-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16)
-    ; GFX9PLUS-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17)
-    ; GFX9PLUS-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]]
-    ; GFX9PLUS-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18)
-    ; GFX9PLUS-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19)
-    ; GFX9PLUS-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]]
-    ; GFX9PLUS-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32)
-    ; GFX9PLUS-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[OR14]]
-    ; GFX9PLUS-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32)
-    ; GFX9PLUS-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20)
-    ; GFX9PLUS-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21)
-    ; GFX9PLUS-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[ZEXTLOAD15]]
-    ; GFX9PLUS-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22)
-    ; GFX9PLUS-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23)
-    ; GFX9PLUS-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]]
-    ; GFX9PLUS-NEXT: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32)
-    ; GFX9PLUS-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]]
-    ; GFX9PLUS-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32)
-    ; GFX9PLUS-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
-    ; GFX9PLUS-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32)
-    ; GFX9PLUS-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]]
+    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p0) :: (load (s64) from unknown-address + 16, align 1)
+    ; GFX9PLUS-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[LOAD]](<2 x s64>)
     ; GFX9PLUS-NEXT: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF
-    ; GFX9PLUS-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64), [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>)
-    ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64), [[OR20]](s64), [[UV3]](s64)
+    ; GFX9PLUS-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64), [[UV4:%[0-9]+]]:_(s64), [[UV5:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>)
+    ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[UV]](s64), [[UV1]](s64), [[LOAD1]](s64), [[UV5]](s64)
     ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>)
     ;
     ; GFX11PLUS-LABEL: name: test_load_flat_v3s64_align1
     ; GFX11PLUS: liveins: $vgpr0_vgpr1
     ; GFX11PLUS-NEXT: {{  $}}
     ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1)
+    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
     ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
-    ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
-    ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
-    ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
-    ; GFX11PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; GFX11PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
-    ; GFX11PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
-    ; GFX11PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]]
-    ; GFX11PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
-    ; GFX11PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
-    ; GFX11PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]]
-    ; GFX11PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32)
-    ; GFX11PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]]
-    ; GFX11PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32)
-    ; GFX11PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
-    ; GFX11PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
-    ; GFX11PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]]
-    ; GFX11PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
-    ; GFX11PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
-    ; GFX11PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]]
-    ; GFX11PLUS-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32)
-    ; GFX11PLUS-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]]
-    ; GFX11PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32)
-    ; GFX11PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
-    ; GFX11PLUS-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32)
-    ; GFX11PLUS-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]]
-    ; GFX11PLUS-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
-    ; GFX11PLUS-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16)
-    ; GFX11PLUS-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17)
-    ; GFX11PLUS-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]]
-    ; GFX11PLUS-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18)
-    ; GFX11PLUS-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19)
-    ; GFX11PLUS-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]]
-    ; GFX11PLUS-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32)
-    ; GFX11PLUS-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[OR14]]
-    ; GFX11PLUS-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32)
-    ; GFX11PLUS-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20)
-    ; GFX11PLUS-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21)
-    ; GFX11PLUS-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[ZEXTLOAD15]]
-    ; GFX11PLUS-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22)
-    ; GFX11PLUS-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23)
-    ; GFX11PLUS-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]]
-    ; GFX11PLUS-NEXT: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32)
-    ; GFX11PLUS-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]]
-    ; GFX11PLUS-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32)
-    ; GFX11PLUS-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
-    ; GFX11PLUS-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32)
-    ; GFX11PLUS-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]]
+    ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p0) :: (load (s64) from unknown-address + 16, align 1)
+    ; GFX11PLUS-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[LOAD]](<2 x s64>)
     ; GFX11PLUS-NEXT: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF
-    ; GFX11PLUS-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64), [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>)
-    ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64), [[OR20]](s64), [[UV3]](s64)
+    ; GFX11PLUS-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64), [[UV4:%[0-9]+]]:_(s64), [[UV5:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>)
+    ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[UV]](s64), [[UV1]](s64), [[LOAD1]](s64), [[UV5]](s64)
     ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>)
     ;
     ; GFX12-LABEL: name: test_load_flat_v3s64_align1
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
     ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
-    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
-    ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
-    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
-    ; GFX12-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
-    ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
-    ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]]
-    ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
-    ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
-    ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]]
-    ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]]
-    ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32)
-    ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
-    ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
-    ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]]
-    ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
-    ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
-    ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]]
-    ; GFX12-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]]
-    ; GFX12-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32)
-    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
-    ; GFX12-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32)
-    ; GFX12-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]]
-    ; GFX12-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
-    ; GFX12-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16)
-    ; GFX12-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17)
-    ; GFX12-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]]
-    ; GFX12-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18)
-    ; GFX12-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19)
-    ; GFX12-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]]
-    ; GFX12-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[OR14]]
-    ; GFX12-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32)
-    ; GFX12-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20)
-    ; GFX12-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21)
-    ; GFX12-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[ZEXTLOAD15]]
-    ; GFX12-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22)
-    ; GFX12-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23)
-    ; GFX12-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]]
-    ; GFX12-NEXT: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]]
-    ; GFX12-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32)
-    ; GFX12-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
-    ; GFX12-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32)
-    ; GFX12-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]]
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p0) :: (load (s64) from unknown-address + 16, align 1)
+    ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[LOAD]](<2 x s64>)
     ; GFX12-NEXT: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF
-    ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64), [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>)
-    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64), [[OR20]](s64), [[UV3]](s64)
+    ; GFX12-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64), [[UV4:%[0-9]+]]:_(s64), [[UV5:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>)
+    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[UV]](s64), [[UV1]](s64), [[LOAD1]](s64), [[UV5]](s64)
     ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v3s64_align1
@@ -12534,228 +9454,28 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
     ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[OR14]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[ZEXTLOAD15]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p0) :: (load (s64) from unknown-address + 16, align 1)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[LOAD]](<2 x s64>)
     ; UNALIGNED_GFX11PLUS-NEXT: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF
-    ; UNALIGNED_GFX11PLUS-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64), [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64), [[OR20]](s64), [[UV3]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64), [[UV4:%[0-9]+]]:_(s64), [[UV5:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[UV]](s64), [[UV1]](s64), [[LOAD1]](s64), [[UV5]](s64)
     ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_v3s64_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1)
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
     ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; UNALIGNED_GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
-    ; UNALIGNED_GFX12-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
-    ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]]
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
-    ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]]
-    ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]]
-    ; UNALIGNED_GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
-    ; UNALIGNED_GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]]
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
-    ; UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]]
-    ; UNALIGNED_GFX12-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]]
-    ; UNALIGNED_GFX12-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]]
-    ; UNALIGNED_GFX12-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17)
-    ; UNALIGNED_GFX12-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]]
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19)
-    ; UNALIGNED_GFX12-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]]
-    ; UNALIGNED_GFX12-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[OR14]]
-    ; UNALIGNED_GFX12-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21)
-    ; UNALIGNED_GFX12-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[ZEXTLOAD15]]
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23)
-    ; UNALIGNED_GFX12-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]]
-    ; UNALIGNED_GFX12-NEXT: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]]
-    ; UNALIGNED_GFX12-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]]
+    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p0) :: (load (s64) from unknown-address + 16, align 1)
+    ; UNALIGNED_GFX12-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[LOAD]](<2 x s64>)
     ; UNALIGNED_GFX12-NEXT: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF
-    ; UNALIGNED_GFX12-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64), [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>)
-    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64), [[OR20]](s64), [[UV3]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64), [[UV4:%[0-9]+]]:_(s64), [[UV5:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>)
+    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[UV]](s64), [[UV1]](s64), [[LOAD1]](s64), [[UV5]](s64)
     ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<3 x s64>) = G_LOAD %0 :: (load (<3 x s64>), align 1, addrspace 0)
@@ -13306,441 +10026,33 @@ body: |
     ; GFX9PLUS: liveins: $vgpr0_vgpr1
     ; GFX9PLUS-NEXT: {{  $}}
     ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1)
+    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
     ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
-    ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
-    ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
-    ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
-    ; GFX9PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; GFX9PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
-    ; GFX9PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
-    ; GFX9PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]]
-    ; GFX9PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
-    ; GFX9PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
-    ; GFX9PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]]
-    ; GFX9PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32)
-    ; GFX9PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]]
-    ; GFX9PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32)
-    ; GFX9PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
-    ; GFX9PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
-    ; GFX9PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]]
-    ; GFX9PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
-    ; GFX9PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
-    ; GFX9PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]]
-    ; GFX9PLUS-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32)
-    ; GFX9PLUS-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]]
-    ; GFX9PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32)
-    ; GFX9PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
-    ; GFX9PLUS-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32)
-    ; GFX9PLUS-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]]
-    ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64)
-    ; GFX9PLUS-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
-    ; GFX9PLUS-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16)
-    ; GFX9PLUS-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17)
-    ; GFX9PLUS-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]]
-    ; GFX9PLUS-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18)
-    ; GFX9PLUS-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19)
-    ; GFX9PLUS-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]]
-    ; GFX9PLUS-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32)
-    ; GFX9PLUS-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[OR14]]
-    ; GFX9PLUS-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32)
-    ; GFX9PLUS-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20)
-    ; GFX9PLUS-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21)
-    ; GFX9PLUS-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[ZEXTLOAD15]]
-    ; GFX9PLUS-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22)
-    ; GFX9PLUS-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23)
-    ; GFX9PLUS-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]]
-    ; GFX9PLUS-NEXT: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32)
-    ; GFX9PLUS-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]]
-    ; GFX9PLUS-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32)
-    ; GFX9PLUS-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
-    ; GFX9PLUS-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32)
-    ; GFX9PLUS-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]]
-    ; GFX9PLUS-NEXT: [[PTR_ADD23:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C6]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD18:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD23]](p0) :: (load (s8) from unknown-address + 24)
-    ; GFX9PLUS-NEXT: [[PTR_ADD24:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD19:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD24]](p0) :: (load (s8) from unknown-address + 25)
-    ; GFX9PLUS-NEXT: [[SHL21:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD19]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR21:%[0-9]+]]:_(s32) = G_OR [[SHL21]], [[ZEXTLOAD18]]
-    ; GFX9PLUS-NEXT: [[PTR_ADD25:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C2]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD20:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD25]](p0) :: (load (s8) from unknown-address + 26)
-    ; GFX9PLUS-NEXT: [[PTR_ADD26:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD25]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD26]](p0) :: (load (s8) from unknown-address + 27)
-    ; GFX9PLUS-NEXT: [[SHL22:%[0-9]+]]:_(s32) = G_SHL [[LOAD6]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR22:%[0-9]+]]:_(s32) = G_OR [[SHL22]], [[ZEXTLOAD20]]
-    ; GFX9PLUS-NEXT: [[SHL23:%[0-9]+]]:_(s32) = G_SHL [[OR22]], [[C3]](s32)
-    ; GFX9PLUS-NEXT: [[OR23:%[0-9]+]]:_(s32) = G_OR [[SHL23]], [[OR21]]
-    ; GFX9PLUS-NEXT: [[ZEXT3:%[0-9]+]]:_(s64) = G_ZEXT [[OR23]](s32)
-    ; GFX9PLUS-NEXT: [[PTR_ADD27:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C4]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD21:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD27]](p0) :: (load (s8) from unknown-address + 28)
-    ; GFX9PLUS-NEXT: [[PTR_ADD28:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD22:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD28]](p0) :: (load (s8) from unknown-address + 29)
-    ; GFX9PLUS-NEXT: [[SHL24:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD22]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR24:%[0-9]+]]:_(s32) = G_OR [[SHL24]], [[ZEXTLOAD21]]
-    ; GFX9PLUS-NEXT: [[PTR_ADD29:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C2]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD23:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD29]](p0) :: (load (s8) from unknown-address + 30)
-    ; GFX9PLUS-NEXT: [[PTR_ADD30:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD29]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD30]](p0) :: (load (s8) from unknown-address + 31)
-    ; GFX9PLUS-NEXT: [[SHL25:%[0-9]+]]:_(s32) = G_SHL [[LOAD7]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR25:%[0-9]+]]:_(s32) = G_OR [[SHL25]], [[ZEXTLOAD23]]
-    ; GFX9PLUS-NEXT: [[SHL26:%[0-9]+]]:_(s32) = G_SHL [[OR25]], [[C3]](s32)
-    ; GFX9PLUS-NEXT: [[OR26:%[0-9]+]]:_(s32) = G_OR [[SHL26]], [[OR24]]
-    ; GFX9PLUS-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[OR26]](s32)
-    ; GFX9PLUS-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
-    ; GFX9PLUS-NEXT: [[SHL27:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT3]], [[COPY3]](s32)
-    ; GFX9PLUS-NEXT: [[OR27:%[0-9]+]]:_(s64) = G_OR [[SHL27]], [[ZEXT3]]
-    ; GFX9PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR20]](s64), [[OR27]](s64)
-    ; GFX9PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s64>), [[BUILD_VECTOR1]](<2 x s64>)
+    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[PTR_ADD]](p0) :: (load (<2 x s64>) from unknown-address + 16, align 1)
+    ; GFX9PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>)
     ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<4 x s64>)
     ;
     ; GFX11PLUS-LABEL: name: test_load_flat_v4s64_align1
     ; GFX11PLUS: liveins: $vgpr0_vgpr1
     ; GFX11PLUS-NEXT: {{  $}}
     ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1)
+    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
     ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
-    ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
-    ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
-    ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
-    ; GFX11PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; GFX11PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
-    ; GFX11PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
-    ; GFX11PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]]
-    ; GFX11PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
-    ; GFX11PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
-    ; GFX11PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]]
-    ; GFX11PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32)
-    ; GFX11PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]]
-    ; GFX11PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32)
-    ; GFX11PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
-    ; GFX11PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
-    ; GFX11PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]]
-    ; GFX11PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
-    ; GFX11PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
-    ; GFX11PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]]
-    ; GFX11PLUS-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32)
-    ; GFX11PLUS-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]]
-    ; GFX11PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32)
-    ; GFX11PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
-    ; GFX11PLUS-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32)
-    ; GFX11PLUS-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]]
-    ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64)
-    ; GFX11PLUS-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
-    ; GFX11PLUS-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16)
-    ; GFX11PLUS-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17)
-    ; GFX11PLUS-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]]
-    ; GFX11PLUS-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18)
-    ; GFX11PLUS-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19)
-    ; GFX11PLUS-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]]
-    ; GFX11PLUS-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32)
-    ; GFX11PLUS-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[OR14]]
-    ; GFX11PLUS-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32)
-    ; GFX11PLUS-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20)
-    ; GFX11PLUS-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21)
-    ; GFX11PLUS-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[ZEXTLOAD15]]
-    ; GFX11PLUS-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22)
-    ; GFX11PLUS-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23)
-    ; GFX11PLUS-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]]
-    ; GFX11PLUS-NEXT: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32)
-    ; GFX11PLUS-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]]
-    ; GFX11PLUS-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32)
-    ; GFX11PLUS-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
-    ; GFX11PLUS-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32)
-    ; GFX11PLUS-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]]
-    ; GFX11PLUS-NEXT: [[PTR_ADD23:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C6]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD18:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD23]](p0) :: (load (s8) from unknown-address + 24)
-    ; GFX11PLUS-NEXT: [[PTR_ADD24:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD19:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD24]](p0) :: (load (s8) from unknown-address + 25)
-    ; GFX11PLUS-NEXT: [[SHL21:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD19]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR21:%[0-9]+]]:_(s32) = G_OR [[SHL21]], [[ZEXTLOAD18]]
-    ; GFX11PLUS-NEXT: [[PTR_ADD25:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C2]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD20:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD25]](p0) :: (load (s8) from unknown-address + 26)
-    ; GFX11PLUS-NEXT: [[PTR_ADD26:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD25]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD26]](p0) :: (load (s8) from unknown-address + 27)
-    ; GFX11PLUS-NEXT: [[SHL22:%[0-9]+]]:_(s32) = G_SHL [[LOAD6]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR22:%[0-9]+]]:_(s32) = G_OR [[SHL22]], [[ZEXTLOAD20]]
-    ; GFX11PLUS-NEXT: [[SHL23:%[0-9]+]]:_(s32) = G_SHL [[OR22]], [[C3]](s32)
-    ; GFX11PLUS-NEXT: [[OR23:%[0-9]+]]:_(s32) = G_OR [[SHL23]], [[OR21]]
-    ; GFX11PLUS-NEXT: [[ZEXT3:%[0-9]+]]:_(s64) = G_ZEXT [[OR23]](s32)
-    ; GFX11PLUS-NEXT: [[PTR_ADD27:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C4]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD21:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD27]](p0) :: (load (s8) from unknown-address + 28)
-    ; GFX11PLUS-NEXT: [[PTR_ADD28:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD22:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD28]](p0) :: (load (s8) from unknown-address + 29)
-    ; GFX11PLUS-NEXT: [[SHL24:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD22]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR24:%[0-9]+]]:_(s32) = G_OR [[SHL24]], [[ZEXTLOAD21]]
-    ; GFX11PLUS-NEXT: [[PTR_ADD29:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C2]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD23:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD29]](p0) :: (load (s8) from unknown-address + 30)
-    ; GFX11PLUS-NEXT: [[PTR_ADD30:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD29]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD30]](p0) :: (load (s8) from unknown-address + 31)
-    ; GFX11PLUS-NEXT: [[SHL25:%[0-9]+]]:_(s32) = G_SHL [[LOAD7]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR25:%[0-9]+]]:_(s32) = G_OR [[SHL25]], [[ZEXTLOAD23]]
-    ; GFX11PLUS-NEXT: [[SHL26:%[0-9]+]]:_(s32) = G_SHL [[OR25]], [[C3]](s32)
-    ; GFX11PLUS-NEXT: [[OR26:%[0-9]+]]:_(s32) = G_OR [[SHL26]], [[OR24]]
-    ; GFX11PLUS-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[OR26]](s32)
-    ; GFX11PLUS-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
-    ; GFX11PLUS-NEXT: [[SHL27:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT3]], [[COPY3]](s32)
-    ; GFX11PLUS-NEXT: [[OR27:%[0-9]+]]:_(s64) = G_OR [[SHL27]], [[ZEXT3]]
-    ; GFX11PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR20]](s64), [[OR27]](s64)
-    ; GFX11PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s64>), [[BUILD_VECTOR1]](<2 x s64>)
+    ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[PTR_ADD]](p0) :: (load (<2 x s64>) from unknown-address + 16, align 1)
+    ; GFX11PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>)
     ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<4 x s64>)
     ;
     ; GFX12-LABEL: name: test_load_flat_v4s64_align1
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
     ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
-    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
-    ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
-    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
-    ; GFX12-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
-    ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
-    ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]]
-    ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
-    ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
-    ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]]
-    ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]]
-    ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32)
-    ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
-    ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
-    ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]]
-    ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
-    ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
-    ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]]
-    ; GFX12-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]]
-    ; GFX12-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32)
-    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
-    ; GFX12-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32)
-    ; GFX12-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]]
-    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64)
-    ; GFX12-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
-    ; GFX12-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16)
-    ; GFX12-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17)
-    ; GFX12-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]]
-    ; GFX12-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18)
-    ; GFX12-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19)
-    ; GFX12-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]]
-    ; GFX12-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[OR14]]
-    ; GFX12-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32)
-    ; GFX12-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20)
-    ; GFX12-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21)
-    ; GFX12-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[ZEXTLOAD15]]
-    ; GFX12-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22)
-    ; GFX12-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23)
-    ; GFX12-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]]
-    ; GFX12-NEXT: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]]
-    ; GFX12-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32)
-    ; GFX12-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
-    ; GFX12-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32)
-    ; GFX12-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]]
-    ; GFX12-NEXT: [[PTR_ADD23:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C6]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD18:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD23]](p0) :: (load (s8) from unknown-address + 24)
-    ; GFX12-NEXT: [[PTR_ADD24:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD19:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD24]](p0) :: (load (s8) from unknown-address + 25)
-    ; GFX12-NEXT: [[SHL21:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD19]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR21:%[0-9]+]]:_(s32) = G_OR [[SHL21]], [[ZEXTLOAD18]]
-    ; GFX12-NEXT: [[PTR_ADD25:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD20:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD25]](p0) :: (load (s8) from unknown-address + 26)
-    ; GFX12-NEXT: [[PTR_ADD26:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD25]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD26]](p0) :: (load (s8) from unknown-address + 27)
-    ; GFX12-NEXT: [[SHL22:%[0-9]+]]:_(s32) = G_SHL [[LOAD6]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR22:%[0-9]+]]:_(s32) = G_OR [[SHL22]], [[ZEXTLOAD20]]
-    ; GFX12-NEXT: [[SHL23:%[0-9]+]]:_(s32) = G_SHL [[OR22]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR23:%[0-9]+]]:_(s32) = G_OR [[SHL23]], [[OR21]]
-    ; GFX12-NEXT: [[ZEXT3:%[0-9]+]]:_(s64) = G_ZEXT [[OR23]](s32)
-    ; GFX12-NEXT: [[PTR_ADD27:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C4]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD21:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD27]](p0) :: (load (s8) from unknown-address + 28)
-    ; GFX12-NEXT: [[PTR_ADD28:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD22:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD28]](p0) :: (load (s8) from unknown-address + 29)
-    ; GFX12-NEXT: [[SHL24:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD22]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR24:%[0-9]+]]:_(s32) = G_OR [[SHL24]], [[ZEXTLOAD21]]
-    ; GFX12-NEXT: [[PTR_ADD29:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD23:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD29]](p0) :: (load (s8) from unknown-address + 30)
-    ; GFX12-NEXT: [[PTR_ADD30:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD29]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD30]](p0) :: (load (s8) from unknown-address + 31)
-    ; GFX12-NEXT: [[SHL25:%[0-9]+]]:_(s32) = G_SHL [[LOAD7]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR25:%[0-9]+]]:_(s32) = G_OR [[SHL25]], [[ZEXTLOAD23]]
-    ; GFX12-NEXT: [[SHL26:%[0-9]+]]:_(s32) = G_SHL [[OR25]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR26:%[0-9]+]]:_(s32) = G_OR [[SHL26]], [[OR24]]
-    ; GFX12-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[OR26]](s32)
-    ; GFX12-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
-    ; GFX12-NEXT: [[SHL27:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT3]], [[COPY3]](s32)
-    ; GFX12-NEXT: [[OR27:%[0-9]+]]:_(s64) = G_OR [[SHL27]], [[ZEXT3]]
-    ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR20]](s64), [[OR27]](s64)
-    ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s64>), [[BUILD_VECTOR1]](<2 x s64>)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[PTR_ADD]](p0) :: (load (<2 x s64>) from unknown-address + 16, align 1)
+    ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>)
     ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<4 x s64>)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v4s64_align1
@@ -13894,294 +10206,22 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
     ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[OR14]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[ZEXTLOAD15]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD23:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C6]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD18:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD23]](p0) :: (load (s8) from unknown-address + 24)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD24:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD19:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD24]](p0) :: (load (s8) from unknown-address + 25)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL21:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD19]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR21:%[0-9]+]]:_(s32) = G_OR [[SHL21]], [[ZEXTLOAD18]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD25:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD20:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD25]](p0) :: (load (s8) from unknown-address + 26)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD26:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD25]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD26]](p0) :: (load (s8) from unknown-address + 27)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL22:%[0-9]+]]:_(s32) = G_SHL [[LOAD6]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR22:%[0-9]+]]:_(s32) = G_OR [[SHL22]], [[ZEXTLOAD20]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL23:%[0-9]+]]:_(s32) = G_SHL [[OR22]], [[C3]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR23:%[0-9]+]]:_(s32) = G_OR [[SHL23]], [[OR21]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXT3:%[0-9]+]]:_(s64) = G_ZEXT [[OR23]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD27:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C4]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD21:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD27]](p0) :: (load (s8) from unknown-address + 28)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD28:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD22:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD28]](p0) :: (load (s8) from unknown-address + 29)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL24:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD22]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR24:%[0-9]+]]:_(s32) = G_OR [[SHL24]], [[ZEXTLOAD21]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD29:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD23:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD29]](p0) :: (load (s8) from unknown-address + 30)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD30:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD29]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD30]](p0) :: (load (s8) from unknown-address + 31)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL25:%[0-9]+]]:_(s32) = G_SHL [[LOAD7]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR25:%[0-9]+]]:_(s32) = G_OR [[SHL25]], [[ZEXTLOAD23]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL26:%[0-9]+]]:_(s32) = G_SHL [[OR25]], [[C3]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR26:%[0-9]+]]:_(s32) = G_OR [[SHL26]], [[OR24]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[OR26]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL27:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT3]], [[COPY3]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR27:%[0-9]+]]:_(s64) = G_OR [[SHL27]], [[ZEXT3]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR20]](s64), [[OR27]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s64>), [[BUILD_VECTOR1]](<2 x s64>)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[PTR_ADD]](p0) :: (load (<2 x s64>) from unknown-address + 16, align 1)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>)
     ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<4 x s64>)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_v4s64_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1)
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
     ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; UNALIGNED_GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
-    ; UNALIGNED_GFX12-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
-    ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]]
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
-    ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]]
-    ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]]
-    ; UNALIGNED_GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
-    ; UNALIGNED_GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]]
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
-    ; UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]]
-    ; UNALIGNED_GFX12-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]]
-    ; UNALIGNED_GFX12-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]]
-    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17)
-    ; UNALIGNED_GFX12-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]]
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19)
-    ; UNALIGNED_GFX12-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]]
-    ; UNALIGNED_GFX12-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[OR14]]
-    ; UNALIGNED_GFX12-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21)
-    ; UNALIGNED_GFX12-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[ZEXTLOAD15]]
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23)
-    ; UNALIGNED_GFX12-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]]
-    ; UNALIGNED_GFX12-NEXT: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]]
-    ; UNALIGNED_GFX12-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]]
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD23:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C6]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD18:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD23]](p0) :: (load (s8) from unknown-address + 24)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD24:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD19:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD24]](p0) :: (load (s8) from unknown-address + 25)
-    ; UNALIGNED_GFX12-NEXT: [[SHL21:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD19]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR21:%[0-9]+]]:_(s32) = G_OR [[SHL21]], [[ZEXTLOAD18]]
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD25:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C2]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD20:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD25]](p0) :: (load (s8) from unknown-address + 26)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD26:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD25]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD26]](p0) :: (load (s8) from unknown-address + 27)
-    ; UNALIGNED_GFX12-NEXT: [[SHL22:%[0-9]+]]:_(s32) = G_SHL [[LOAD6]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR22:%[0-9]+]]:_(s32) = G_OR [[SHL22]], [[ZEXTLOAD20]]
-    ; UNALIGNED_GFX12-NEXT: [[SHL23:%[0-9]+]]:_(s32) = G_SHL [[OR22]], [[C3]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR23:%[0-9]+]]:_(s32) = G_OR [[SHL23]], [[OR21]]
-    ; UNALIGNED_GFX12-NEXT: [[ZEXT3:%[0-9]+]]:_(s64) = G_ZEXT [[OR23]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD27:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C4]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD21:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD27]](p0) :: (load (s8) from unknown-address + 28)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD28:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD22:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD28]](p0) :: (load (s8) from unknown-address + 29)
-    ; UNALIGNED_GFX12-NEXT: [[SHL24:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD22]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR24:%[0-9]+]]:_(s32) = G_OR [[SHL24]], [[ZEXTLOAD21]]
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD29:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C2]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD23:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD29]](p0) :: (load (s8) from unknown-address + 30)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD30:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD29]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD30]](p0) :: (load (s8) from unknown-address + 31)
-    ; UNALIGNED_GFX12-NEXT: [[SHL25:%[0-9]+]]:_(s32) = G_SHL [[LOAD7]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR25:%[0-9]+]]:_(s32) = G_OR [[SHL25]], [[ZEXTLOAD23]]
-    ; UNALIGNED_GFX12-NEXT: [[SHL26:%[0-9]+]]:_(s32) = G_SHL [[OR25]], [[C3]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR26:%[0-9]+]]:_(s32) = G_OR [[SHL26]], [[OR24]]
-    ; UNALIGNED_GFX12-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[OR26]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[SHL27:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT3]], [[COPY3]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR27:%[0-9]+]]:_(s64) = G_OR [[SHL27]], [[ZEXT3]]
-    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR20]](s64), [[OR27]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s64>), [[BUILD_VECTOR1]](<2 x s64>)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[PTR_ADD]](p0) :: (load (<2 x s64>) from unknown-address + 16, align 1)
+    ; UNALIGNED_GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>)
     ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<4 x s64>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<4 x s64>) = G_LOAD %0 :: (load (<4 x s64>), align 1, addrspace 0)
@@ -14762,210 +10802,24 @@ body: |
     ; GFX9PLUS: liveins: $vgpr0_vgpr1
     ; GFX9PLUS-NEXT: {{  $}}
     ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; GFX9PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
-    ; GFX9PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
-    ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
-    ; GFX9PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
-    ; GFX9PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
-    ; GFX9PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
-    ; GFX9PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
-    ; GFX9PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
-    ; GFX9PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12
-    ; GFX9PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
-    ; GFX9PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
-    ; GFX9PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
-    ; GFX9PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
-    ; GFX9PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
-    ; GFX9PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
-    ; GFX9PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
-    ; GFX9PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
-    ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
-    ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 1)
+    ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>)
     ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>)
     ;
     ; GFX11PLUS-LABEL: name: test_load_flat_v2p1_align1
     ; GFX11PLUS: liveins: $vgpr0_vgpr1
     ; GFX11PLUS-NEXT: {{  $}}
     ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; GFX11PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
-    ; GFX11PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
-    ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
-    ; GFX11PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
-    ; GFX11PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
-    ; GFX11PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
-    ; GFX11PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
-    ; GFX11PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
-    ; GFX11PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12
-    ; GFX11PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
-    ; GFX11PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
-    ; GFX11PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
-    ; GFX11PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
-    ; GFX11PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
-    ; GFX11PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
-    ; GFX11PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
-    ; GFX11PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
-    ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
-    ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 1)
+    ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>)
     ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>)
     ;
     ; GFX12-LABEL: name: test_load_flat_v2p1_align1
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
-    ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
-    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
-    ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
-    ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
-    ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
-    ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
-    ; GFX12-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12
-    ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
-    ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
-    ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
-    ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
-    ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
-    ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
-    ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
-    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
-    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 1)
+    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>)
     ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2p1_align1
@@ -15042,140 +10896,16 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 1)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>)
     ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_v2p1_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
-    ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
-    ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
-    ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
-    ; UNALIGNED_GFX12-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
-    ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
-    ; UNALIGNED_GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
-    ; UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
-    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 1)
+    ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>)
     ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<2 x p1>) = G_LOAD %0 :: (load (<2 x p1>), align 1, addrspace 0)
@@ -15436,120 +11166,24 @@ body: |
     ; GFX9PLUS: liveins: $vgpr0_vgpr1
     ; GFX9PLUS-NEXT: {{  $}}
     ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32)
-    ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>)
+    ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>), align 1)
+    ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>)
     ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>)
     ;
     ; GFX11PLUS-LABEL: name: test_load_flat_v2p3_align1
     ; GFX11PLUS: liveins: $vgpr0_vgpr1
     ; GFX11PLUS-NEXT: {{  $}}
     ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32)
-    ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>)
+    ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>), align 1)
+    ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>)
     ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>)
     ;
     ; GFX12-LABEL: name: test_load_flat_v2p3_align1
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32)
-    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>), align 1)
+    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>)
     ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2p3_align1
@@ -15596,80 +11230,16 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>), align 1)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>)
     ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_v2p3_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>), align 1)
+    ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>)
     ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<2 x p3>) = G_LOAD %0 :: (load (<2 x p3>), align 1, addrspace 0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir
index 472cbe559e56f4..6ea0515924bf67 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir
@@ -636,27 +636,15 @@ body: |
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: $vgpr0 = COPY [[OR]](s32)
+    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5)
+    ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     ;
     ; GFX10-LABEL: name: test_load_private_s16_align1
     ; GFX10: liveins: $vgpr0
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-    ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
-    ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX10-NEXT: $vgpr0 = COPY [[OR]](s32)
+    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5)
+    ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     ;
     ; GFX11-LABEL: name: test_load_private_s16_align1
     ; GFX11: liveins: $vgpr0
@@ -853,27 +841,15 @@ body: |
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: $vgpr0 = COPY [[OR]](s32)
+    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5)
+    ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     ;
     ; GFX10-LABEL: name: test_load_private_s32_align2
     ; GFX10: liveins: $vgpr0
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
-    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
-    ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX10-NEXT: $vgpr0 = COPY [[OR]](s32)
+    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5)
+    ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     ;
     ; GFX11-LABEL: name: test_load_private_s32_align2
     ; GFX11: liveins: $vgpr0
@@ -1012,47 +988,15 @@ body: |
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX9-NEXT: $vgpr0 = COPY [[OR2]](s32)
+    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5)
+    ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     ;
     ; GFX10-LABEL: name: test_load_private_s32_align1
     ; GFX10: liveins: $vgpr0
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-    ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
-    ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX10-NEXT: $vgpr0 = COPY [[OR2]](s32)
+    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5)
+    ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     ;
     ; GFX11-LABEL: name: test_load_private_s32_align1
     ; GFX11: liveins: $vgpr0
@@ -1529,39 +1473,27 @@ body: |
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5)
+    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
     ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
     ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]]
-    ; GFX9-NEXT: $vgpr0 = COPY [[OR1]](s32)
+    ; GFX9-NEXT: $vgpr0 = COPY [[OR]](s32)
     ;
     ; GFX10-LABEL: name: test_load_private_s24_align1
     ; GFX10: liveins: $vgpr0
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5)
+    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
     ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
-    ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
     ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]]
-    ; GFX10-NEXT: $vgpr0 = COPY [[OR1]](s32)
+    ; GFX10-NEXT: $vgpr0 = COPY [[OR]](s32)
     ;
     ; GFX11-LABEL: name: test_load_private_s24_align1
     ; GFX11: liveins: $vgpr0
@@ -2147,42 +2079,22 @@ body: |
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5)
+    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
     ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5)
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32)
+    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5)
+    ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32)
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64)
     ;
     ; GFX10-LABEL: name: test_load_private_s64_align2
     ; GFX10: liveins: $vgpr0
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
-    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5)
+    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
     ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
-    ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5)
-    ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32)
+    ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5)
+    ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32)
     ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64)
     ;
     ; GFX11-LABEL: name: test_load_private_s64_align2
@@ -2386,78 +2298,22 @@ body: |
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5)
+    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
     ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
-    ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
-    ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32)
+    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5)
+    ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32)
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64)
     ;
     ; GFX10-LABEL: name: test_load_private_s64_align1
     ; GFX10: liveins: $vgpr0
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5)
+    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
     ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
-    ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
-    ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
-    ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32)
+    ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5)
+    ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32)
     ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64)
     ;
     ; GFX11-LABEL: name: test_load_private_s64_align1
@@ -2742,53 +2598,14 @@ body: |
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5)
+    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
     ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5)
     ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
-    ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
-    ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
-    ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
-    ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
-    ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
-    ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32)
+    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5)
+    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32)
     ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
     ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     ;
@@ -2796,53 +2613,14 @@ body: |
     ; GFX10: liveins: $vgpr0
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5)
+    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
     ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5)
     ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
-    ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
-    ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
-    ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
-    ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
-    ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
-    ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
-    ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32)
+    ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5)
+    ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32)
     ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
     ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     ;
@@ -3381,28 +3159,14 @@ body: |
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5)
+    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
     ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5)
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5)
-    ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32)
+    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5)
+    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 2, addrspace 5)
+    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32)
     ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
     ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     ;
@@ -3410,28 +3174,14 @@ body: |
     ; GFX10: liveins: $vgpr0
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
-    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5)
+    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
     ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
-    ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5)
-    ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5)
-    ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
-    ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32)
+    ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5)
+    ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 2, addrspace 5)
+    ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32)
     ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
     ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     ;
@@ -3701,53 +3451,14 @@ body: |
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5)
+    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
     ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5)
     ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
-    ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
-    ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
-    ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
-    ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
-    ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
-    ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32)
+    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5)
+    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32)
     ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
     ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     ;
@@ -3755,53 +3466,14 @@ body: |
     ; GFX10: liveins: $vgpr0
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5)
+    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
     ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5)
     ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
-    ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
-    ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
-    ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
-    ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
-    ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
-    ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
-    ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32)
+    ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5)
+    ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32)
     ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
     ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     ;
@@ -4166,68 +3838,17 @@ body: |
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5)
+    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
     ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5)
     ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
-    ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
-    ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
-    ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
-    ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
-    ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
-    ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
-    ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
-    ; GFX9-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5)
-    ; GFX9-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
-    ; GFX9-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5)
-    ; GFX9-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
-    ; GFX9-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
+    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5)
+    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5)
+    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32)
     ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
     ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128)
     ;
@@ -4235,68 +3856,17 @@ body: |
     ; GFX10: liveins: $vgpr0
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5)
+    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
     ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5)
     ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
-    ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
-    ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
-    ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
-    ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
-    ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
-    ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
-    ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
-    ; GFX10-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5)
-    ; GFX10-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
-    ; GFX10-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5)
-    ; GFX10-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
-    ; GFX10-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
-    ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
+    ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5)
+    ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5)
+    ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32)
     ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
     ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128)
     ;
@@ -4928,35 +4498,17 @@ body: |
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5)
+    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
     ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5)
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5)
-    ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
-    ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
-    ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5)
-    ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32)
+    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5)
+    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 2, addrspace 5)
+    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 2, addrspace 5)
+    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32)
     ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
     ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128)
     ;
@@ -4964,35 +4516,17 @@ body: |
     ; GFX10: liveins: $vgpr0
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
-    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5)
+    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
     ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
-    ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5)
-    ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5)
-    ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
-    ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
-    ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5)
-    ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32)
+    ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5)
+    ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 2, addrspace 5)
+    ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 2, addrspace 5)
+    ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32)
     ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
     ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128)
     ;
@@ -5321,68 +4855,17 @@ body: |
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5)
+    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
     ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5)
     ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
-    ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
-    ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
-    ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
-    ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
-    ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
-    ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
-    ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
-    ; GFX9-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5)
-    ; GFX9-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
-    ; GFX9-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5)
-    ; GFX9-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
-    ; GFX9-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
+    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5)
+    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5)
+    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32)
     ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
     ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128)
     ;
@@ -5390,68 +4873,17 @@ body: |
     ; GFX10: liveins: $vgpr0
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5)
+    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
     ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5)
     ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
-    ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
-    ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
-    ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
-    ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
-    ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
-    ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
-    ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
-    ; GFX10-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5)
-    ; GFX10-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
-    ; GFX10-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5)
-    ; GFX10-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
-    ; GFX10-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
-    ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
+    ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5)
+    ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5)
+    ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32)
     ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
     ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128)
     ;
@@ -5932,42 +5364,22 @@ body: |
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5)
+    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
     ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5)
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32)
+    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5)
+    ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32)
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[MV]](p1)
     ;
     ; GFX10-LABEL: name: test_load_private_p1_align2
     ; GFX10: liveins: $vgpr0
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
-    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5)
+    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
     ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
-    ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5)
-    ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32)
+    ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5)
+    ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32)
     ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[MV]](p1)
     ;
     ; GFX11-LABEL: name: test_load_private_p1_align2
@@ -6171,78 +5583,22 @@ body: |
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5)
+    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
     ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
-    ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
-    ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32)
+    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5)
+    ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32)
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[MV]](p1)
     ;
     ; GFX10-LABEL: name: test_load_private_p1_align1
     ; GFX10: liveins: $vgpr0
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5)
+    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
     ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
-    ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
-    ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
-    ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32)
+    ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5)
+    ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32)
     ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[MV]](p1)
     ;
     ; GFX11-LABEL: name: test_load_private_p1_align1
@@ -6494,29 +5850,15 @@ body: |
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR]](s32)
-    ; GFX9-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3)
+    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 2, addrspace 5)
+    ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](p3)
     ;
     ; GFX10-LABEL: name: test_load_private_p3_align2
     ; GFX10: liveins: $vgpr0
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
-    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
-    ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX10-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR]](s32)
-    ; GFX10-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3)
+    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 2, addrspace 5)
+    ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](p3)
     ;
     ; GFX11-LABEL: name: test_load_private_p3_align2
     ; GFX11: liveins: $vgpr0
@@ -6660,49 +6002,15 @@ body: |
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX9-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32)
-    ; GFX9-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3)
+    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 1, addrspace 5)
+    ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](p3)
     ;
     ; GFX10-LABEL: name: test_load_private_p3_align1
     ; GFX10: liveins: $vgpr0
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-    ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
-    ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX10-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32)
-    ; GFX10-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3)
+    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 1, addrspace 5)
+    ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](p3)
     ;
     ; GFX11-LABEL: name: test_load_private_p3_align1
     ; GFX11: liveins: $vgpr0
@@ -6923,29 +6231,15 @@ body: |
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32)
-    ; GFX9-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5)
+    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 2, addrspace 5)
+    ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](p5)
     ;
     ; GFX10-LABEL: name: test_load_private_p5_align2
     ; GFX10: liveins: $vgpr0
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
-    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
-    ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX10-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32)
-    ; GFX10-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5)
+    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 2, addrspace 5)
+    ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](p5)
     ;
     ; GFX11-LABEL: name: test_load_private_p5_align2
     ; GFX11: liveins: $vgpr0
@@ -7089,49 +6383,15 @@ body: |
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX9-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32)
-    ; GFX9-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5)
+    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 1, addrspace 5)
+    ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](p5)
     ;
     ; GFX10-LABEL: name: test_load_private_p5_align1
     ; GFX10: liveins: $vgpr0
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-    ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
-    ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX10-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32)
-    ; GFX10-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5)
+    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 1, addrspace 5)
+    ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](p5)
     ;
     ; GFX11-LABEL: name: test_load_private_p5_align1
     ; GFX11: liveins: $vgpr0
@@ -7357,30 +6617,20 @@ body: |
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32)
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[LSHR]](s32)
+    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5)
+    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C]](s32)
+    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LSHR]](s32)
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
     ;
     ; GFX10-LABEL: name: test_load_private_v2s8_align1
     ; GFX10: liveins: $vgpr0
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-    ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
-    ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32)
-    ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[LSHR]](s32)
+    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5)
+    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C]](s32)
+    ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LSHR]](s32)
     ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
     ;
     ; GFX11-LABEL: name: test_load_private_v2s8_align1
@@ -7938,81 +7188,71 @@ body: |
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5)
+    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
     ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
     ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]]
-    ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32)
-    ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32)
+    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C2]](s32)
+    ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32)
     ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-    ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
-    ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
-    ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]]
+    ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
+    ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
+    ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]]
     ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
-    ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]]
-    ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
-    ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16)
-    ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]]
+    ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]]
+    ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
+    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16)
+    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL1]]
     ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
-    ; GFX9-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]]
+    ; GFX9-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]]
     ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32)
-    ; GFX9-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]]
-    ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16)
-    ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL3]]
-    ; GFX9-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16)
-    ; GFX9-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16)
-    ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]]
-    ; GFX9-NEXT: $vgpr0 = COPY [[OR4]](s32)
+    ; GFX9-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]]
+    ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16)
+    ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]]
+    ; GFX9-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16)
+    ; GFX9-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16)
+    ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32)
+    ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]]
+    ; GFX9-NEXT: $vgpr0 = COPY [[OR3]](s32)
     ;
     ; GFX10-LABEL: name: test_load_private_v3s8_align1
     ; GFX10: liveins: $vgpr0
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5)
+    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
     ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
-    ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
     ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]]
-    ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32)
-    ; GFX10-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32)
+    ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C2]](s32)
+    ; GFX10-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32)
     ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-    ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
-    ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
-    ; GFX10-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]]
+    ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
+    ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
+    ; GFX10-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]]
     ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
-    ; GFX10-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]]
-    ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
-    ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16)
-    ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]]
+    ; GFX10-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]]
+    ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
+    ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16)
+    ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL1]]
     ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
-    ; GFX10-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]]
+    ; GFX10-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]]
     ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32)
-    ; GFX10-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]]
-    ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16)
-    ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL3]]
-    ; GFX10-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16)
-    ; GFX10-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16)
-    ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]]
-    ; GFX10-NEXT: $vgpr0 = COPY [[OR4]](s32)
+    ; GFX10-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]]
+    ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16)
+    ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]]
+    ; GFX10-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16)
+    ; GFX10-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16)
+    ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32)
+    ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]]
+    ; GFX10-NEXT: $vgpr0 = COPY [[OR3]](s32)
     ;
     ; GFX11-LABEL: name: test_load_private_v3s8_align1
     ; GFX11: liveins: $vgpr0
@@ -8658,136 +7898,34 @@ body: |
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5)
+    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
     ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5)
     ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
-    ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
-    ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
-    ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
-    ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
-    ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
-    ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
-    ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
-    ; GFX9-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5)
-    ; GFX9-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
-    ; GFX9-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5)
-    ; GFX9-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
-    ; GFX9-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
+    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5)
+    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5)
+    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32)
     ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>)
     ;
     ; GFX10-LABEL: name: test_load_private_v16s8_align16
     ; GFX10: liveins: $vgpr0
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5)
+    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
     ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5)
     ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
-    ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
-    ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
-    ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
-    ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
-    ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
-    ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
-    ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
-    ; GFX10-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5)
-    ; GFX10-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
-    ; GFX10-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5)
-    ; GFX10-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
-    ; GFX10-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
-    ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
+    ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5)
+    ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5)
+    ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32)
     ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>)
     ;
     ; GFX11-LABEL: name: test_load_private_v16s8_align16
@@ -9107,27 +8245,15 @@ body: |
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5)
-    ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
-    ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-    ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
+    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 2, addrspace 5)
+    ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>)
     ;
     ; GFX10-LABEL: name: test_load_private_v2s16_align2
     ; GFX10: liveins: $vgpr0
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5)
-    ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
-    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
-    ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
-    ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-    ; GFX10-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
+    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 2, addrspace 5)
+    ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>)
     ;
     ; GFX11-LABEL: name: test_load_private_v2s16_align2
     ; GFX11: liveins: $vgpr0
@@ -9278,47 +8404,15 @@ body: |
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-    ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-    ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
+    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 1, addrspace 5)
+    ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>)
     ;
     ; GFX10-LABEL: name: test_load_private_v2s16_align1
     ; GFX10: liveins: $vgpr0
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-    ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
-    ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
-    ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
-    ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-    ; GFX10-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
+    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 1, addrspace 5)
+    ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>)
     ;
     ; GFX11-LABEL: name: test_load_private_v2s16_align1
     ; GFX11: liveins: $vgpr0
@@ -9824,27 +8918,26 @@ body: |
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5)
-    ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 2, addrspace 5)
+    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
     ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
-    ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
-    ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
-    ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
+    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
+    ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
+    ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[LOAD]](<2 x s16>)
+    ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32)
+    ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
     ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
     ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
-    ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
-    ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32)
-    ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
-    ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
-    ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-    ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+    ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+    ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+    ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+    ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
+    ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+    ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16)
+    ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC3]](s16)
     ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
     ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
     ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>)
@@ -9853,27 +8946,26 @@ body: |
     ; GFX10: liveins: $vgpr0
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5)
-    ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
-    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 2, addrspace 5)
+    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
     ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
-    ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
-    ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
-    ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
-    ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
+    ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
+    ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
+    ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[LOAD]](<2 x s16>)
+    ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+    ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32)
+    ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
     ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
     ; GFX10-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
-    ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
-    ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
-    ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32)
-    ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
-    ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
-    ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
-    ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-    ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+    ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+    ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+    ; GFX10-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+    ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
+    ; GFX10-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+    ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+    ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16)
+    ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC3]](s16)
     ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
     ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
     ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>)
@@ -10215,41 +9307,26 @@ body: |
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 1, addrspace 5)
+    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
     ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
-    ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
-    ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
-    ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32)
+    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 4, align 1, addrspace 5)
+    ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
+    ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[LOAD]](<2 x s16>)
+    ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32)
+    ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
     ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
     ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
-    ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
-    ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
-    ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32)
-    ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
-    ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
-    ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-    ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+    ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+    ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+    ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+    ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
+    ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+    ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16)
+    ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC3]](s16)
     ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
     ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
     ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>)
@@ -10258,41 +9335,26 @@ body: |
     ; GFX10: liveins: $vgpr0
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 1, addrspace 5)
+    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
     ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
-    ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
-    ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
-    ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
-    ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
-    ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32)
+    ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 4, align 1, addrspace 5)
+    ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
+    ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[LOAD]](<2 x s16>)
+    ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
+    ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32)
+    ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
     ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
     ; GFX10-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
-    ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
-    ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
-    ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32)
-    ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
-    ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
-    ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
-    ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-    ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+    ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
+    ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
+    ; GFX10-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32)
+    ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
+    ; GFX10-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
+    ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
+    ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16)
+    ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC3]](s16)
     ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16)
     ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>)
     ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>)
@@ -10827,44 +9889,22 @@ body: |
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5)
-    ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 2, addrspace 5)
+    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
     ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
-    ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
-    ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
-    ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
-    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5)
-    ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32)
-    ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
-    ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
+    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[PTR_ADD]](p5) :: (load (<2 x s16>) from unknown-address + 4, align 2, addrspace 5)
+    ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[LOAD]](<2 x s16>), [[LOAD1]](<2 x s16>)
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
     ;
     ; GFX10-LABEL: name: test_load_private_v4s16_align2
     ; GFX10: liveins: $vgpr0
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5)
-    ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
-    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 2, addrspace 5)
+    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
     ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
-    ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
-    ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-    ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
-    ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
-    ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
-    ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5)
-    ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32)
-    ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
-    ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
+    ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[PTR_ADD]](p5) :: (load (<2 x s16>) from unknown-address + 4, align 2, addrspace 5)
+    ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[LOAD]](<2 x s16>), [[LOAD1]](<2 x s16>)
     ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
     ;
     ; GFX11-LABEL: name: test_load_private_v4s16_align2
@@ -11091,80 +10131,22 @@ body: |
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 1, addrspace 5)
+    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
     ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-    ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
-    ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
-    ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32)
-    ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
-    ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32)
-    ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
-    ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
+    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[PTR_ADD]](p5) :: (load (<2 x s16>) from unknown-address + 4, align 1, addrspace 5)
+    ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[LOAD]](<2 x s16>), [[LOAD1]](<2 x s16>)
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
     ;
     ; GFX10-LABEL: name: test_load_private_v4s16_align1
     ; GFX10: liveins: $vgpr0
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 1, addrspace 5)
+    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
     ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
-    ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
-    ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
-    ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-    ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
-    ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
-    ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32)
-    ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
-    ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32)
-    ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
-    ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
+    ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[PTR_ADD]](p5) :: (load (<2 x s16>) from unknown-address + 4, align 1, addrspace 5)
+    ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[LOAD]](<2 x s16>), [[LOAD1]](<2 x s16>)
     ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
     ;
     ; GFX11-LABEL: name: test_load_private_v4s16_align1
@@ -11582,42 +10564,22 @@ body: |
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5)
+    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
     ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5)
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32)
+    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5)
+    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32)
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
     ;
     ; GFX10-LABEL: name: test_load_private_v2s32_align2
     ; GFX10: liveins: $vgpr0
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
-    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5)
+    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
     ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
-    ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5)
-    ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32)
+    ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5)
+    ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32)
     ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
     ;
     ; GFX11-LABEL: name: test_load_private_v2s32_align2
@@ -11821,78 +10783,22 @@ body: |
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5)
+    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
     ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
-    ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
-    ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32)
+    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5)
+    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32)
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
     ;
     ; GFX10-LABEL: name: test_load_private_v2s32_align1
     ; GFX10: liveins: $vgpr0
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5)
+    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
     ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
-    ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
-    ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
-    ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32)
+    ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5)
+    ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32)
     ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
     ;
     ; GFX11-LABEL: name: test_load_private_v2s32_align1
@@ -12174,106 +11080,28 @@ body: |
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5)
+    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
     ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5)
     ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
-    ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
-    ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
-    ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
-    ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
-    ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
-    ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32)
+    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5)
+    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32)
     ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
     ;
     ; GFX10-LABEL: name: test_load_private_v3s32_align16
     ; GFX10: liveins: $vgpr0
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5)
+    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
     ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5)
     ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
-    ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
-    ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
-    ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
-    ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
-    ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
-    ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
-    ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32)
+    ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5)
+    ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32)
     ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
     ;
     ; GFX11-LABEL: name: test_load_private_v3s32_align16
@@ -12764,136 +11592,34 @@ body: |
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5)
+    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
     ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5)
     ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
-    ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
-    ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
-    ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
-    ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
-    ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
-    ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
-    ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
-    ; GFX9-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5)
-    ; GFX9-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
-    ; GFX9-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5)
-    ; GFX9-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
-    ; GFX9-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
+    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5)
+    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5)
+    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32)
     ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>)
     ;
     ; GFX10-LABEL: name: test_load_private_v4s32_align16
     ; GFX10: liveins: $vgpr0
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5)
+    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
     ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5)
     ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
-    ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
-    ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
-    ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
-    ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
-    ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
-    ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
-    ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
-    ; GFX10-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5)
-    ; GFX10-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
-    ; GFX10-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5)
-    ; GFX10-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
-    ; GFX10-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
-    ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
+    ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5)
+    ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5)
+    ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32)
     ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>)
     ;
     ; GFX11-LABEL: name: test_load_private_v4s32_align16
@@ -13493,70 +12219,34 @@ body: |
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5)
+    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
     ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5)
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5)
-    ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
-    ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
-    ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5)
-    ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32)
+    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5)
+    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 2, addrspace 5)
+    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 2, addrspace 5)
+    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32)
     ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>)
     ;
     ; GFX10-LABEL: name: test_load_private_v4s32_align2
     ; GFX10: liveins: $vgpr0
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
-    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5)
+    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
     ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
-    ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5)
-    ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5)
-    ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
-    ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
-    ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5)
-    ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32)
+    ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5)
+    ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 2, addrspace 5)
+    ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 2, addrspace 5)
+    ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32)
     ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>)
     ;
     ; GFX11-LABEL: name: test_load_private_v4s32_align2
@@ -13875,136 +12565,34 @@ body: |
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5)
+    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
     ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5)
     ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
-    ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
-    ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
-    ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
-    ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
-    ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
-    ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
-    ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
-    ; GFX9-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5)
-    ; GFX9-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
-    ; GFX9-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5)
-    ; GFX9-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
-    ; GFX9-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
+    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5)
+    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5)
+    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32)
     ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>)
     ;
     ; GFX10-LABEL: name: test_load_private_v4s32_align1
     ; GFX10: liveins: $vgpr0
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5)
+    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
     ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5)
     ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
-    ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
-    ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
-    ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
-    ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
-    ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
-    ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
-    ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
-    ; GFX10-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5)
-    ; GFX10-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
-    ; GFX10-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5)
-    ; GFX10-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
-    ; GFX10-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
-    ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
+    ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5)
+    ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5)
+    ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32)
     ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>)
     ;
     ; GFX11-LABEL: name: test_load_private_v4s32_align1
@@ -15262,68 +13850,17 @@ body: |
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5)
+    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
     ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5)
+    ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32)
     ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5)
     ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
-    ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
-    ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32)
-    ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
-    ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
-    ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
-    ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
-    ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
-    ; GFX9-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5)
-    ; GFX9-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
-    ; GFX9-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5)
-    ; GFX9-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
-    ; GFX9-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
-    ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR8]](s32), [[OR11]](s32)
+    ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5)
+    ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD2]](s32), [[LOAD3]](s32)
     ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64)
     ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
     ;
@@ -15331,68 +13868,17 @@ body: |
     ; GFX10: liveins: $vgpr0
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5)
+    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
     ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5)
+    ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32)
     ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5)
     ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
-    ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
-    ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32)
-    ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
-    ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
-    ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
-    ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
-    ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
-    ; GFX10-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5)
-    ; GFX10-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
-    ; GFX10-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5)
-    ; GFX10-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
-    ; GFX10-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
-    ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR8]](s32), [[OR11]](s32)
+    ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5)
+    ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD2]](s32), [[LOAD3]](s32)
     ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64)
     ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
     ;
@@ -18189,98 +16675,23 @@ body: |
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5)
+    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
     ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5)
     ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
-    ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
-    ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
-    ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
-    ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
-    ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
-    ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32)
+    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5)
+    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32)
     ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
-    ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
-    ; GFX9-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5)
-    ; GFX9-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
-    ; GFX9-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5)
-    ; GFX9-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
-    ; GFX9-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
-    ; GFX9-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C4]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p5) :: (load (s8) from unknown-address + 16, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p5) :: (load (s8) from unknown-address + 17, addrspace 5)
-    ; GFX9-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[ZEXTLOAD12]]
-    ; GFX9-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p5) :: (load (s8) from unknown-address + 18, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD17]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p5) :: (load (s8) from unknown-address + 19, addrspace 5)
-    ; GFX9-NEXT: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR13:%[0-9]+]]:_(s32) = G_OR [[SHL13]], [[ZEXTLOAD14]]
-    ; GFX9-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[OR13]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[OR12]]
-    ; GFX9-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C1]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p5) :: (load (s8) from unknown-address + 20, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], [[C]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p5) :: (load (s8) from unknown-address + 21, addrspace 5)
-    ; GFX9-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD15]]
-    ; GFX9-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p5) :: (load (s8) from unknown-address + 22, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD21]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p5) :: (load (s8) from unknown-address + 23, addrspace 5)
-    ; GFX9-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[ZEXTLOAD17]]
-    ; GFX9-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[OR16]], [[C3]](s32)
-    ; GFX9-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[OR15]]
-    ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR11]](s32), [[OR14]](s32), [[OR17]](s32)
+    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5)
+    ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD2]], [[C]](s32)
+    ; GFX9-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p5) :: (load (s32) from unknown-address + 16, align 1, addrspace 5)
+    ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD2]], [[C1]](s32)
+    ; GFX9-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s32) from unknown-address + 20, align 1, addrspace 5)
+    ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD3]](s32), [[LOAD4]](s32), [[LOAD5]](s32)
     ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>)
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96)
     ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96)
@@ -18291,98 +16702,23 @@ body: |
     ; GFX10: liveins: $vgpr0
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5)
+    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
     ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5)
     ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
-    ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
-    ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
-    ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
-    ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
-    ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
-    ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
-    ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32)
+    ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5)
+    ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32)
     ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
-    ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
-    ; GFX10-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5)
-    ; GFX10-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
-    ; GFX10-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5)
-    ; GFX10-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
-    ; GFX10-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
-    ; GFX10-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C4]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p5) :: (load (s8) from unknown-address + 16, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p5) :: (load (s8) from unknown-address + 17, addrspace 5)
-    ; GFX10-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[ZEXTLOAD12]]
-    ; GFX10-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p5) :: (load (s8) from unknown-address + 18, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD17]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p5) :: (load (s8) from unknown-address + 19, addrspace 5)
-    ; GFX10-NEXT: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR13:%[0-9]+]]:_(s32) = G_OR [[SHL13]], [[ZEXTLOAD14]]
-    ; GFX10-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[OR13]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[OR12]]
-    ; GFX10-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C1]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p5) :: (load (s8) from unknown-address + 20, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], [[C]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p5) :: (load (s8) from unknown-address + 21, addrspace 5)
-    ; GFX10-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD15]]
-    ; GFX10-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p5) :: (load (s8) from unknown-address + 22, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD21]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p5) :: (load (s8) from unknown-address + 23, addrspace 5)
-    ; GFX10-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[ZEXTLOAD17]]
-    ; GFX10-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[OR16]], [[C3]](s32)
-    ; GFX10-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[OR15]]
-    ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR11]](s32), [[OR14]](s32), [[OR17]](s32)
+    ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5)
+    ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD2]], [[C]](s32)
+    ; GFX10-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p5) :: (load (s32) from unknown-address + 16, align 1, addrspace 5)
+    ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD2]], [[C1]](s32)
+    ; GFX10-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s32) from unknown-address + 20, align 1, addrspace 5)
+    ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD3]](s32), [[LOAD4]](s32), [[LOAD5]](s32)
     ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>)
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96)
     ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96)
@@ -18829,49 +17165,23 @@ body: |
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
-    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5)
+    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
     ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
-    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5)
-    ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5)
-    ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
-    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32)
+    ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5)
+    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 2, addrspace 5)
+    ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32)
     ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
-    ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
-    ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5)
-    ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C2]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s16) from unknown-address + 16, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p5) :: (load (s16) from unknown-address + 18, addrspace 5)
-    ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD4]]
-    ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C3]](s32)
-    ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s16) from unknown-address + 20, addrspace 5)
-    ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
-    ; GFX9-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s16) from unknown-address + 22, addrspace 5)
-    ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32)
-    ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[ZEXTLOAD5]]
-    ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR3]](s32), [[OR4]](s32), [[OR5]](s32)
+    ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 2, addrspace 5)
+    ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD2]], [[C]](s32)
+    ; GFX9-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p5) :: (load (s32) from unknown-address + 16, align 2, addrspace 5)
+    ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD2]], [[C1]](s32)
+    ; GFX9-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s32) from unknown-address + 20, align 2, addrspace 5)
+    ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD3]](s32), [[LOAD4]](s32), [[LOAD5]](s32)
     ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>)
     ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96)
     ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96)
@@ -18882,49 +17192,23 @@ body: |
     ; GFX10: liveins: $vgpr0
     ; GFX10-NEXT: {{  $}}
     ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
-    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5)
+    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
     ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
-    ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5)
-    ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5)
-    ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
-    ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32)
+    ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5)
+    ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 2, addrspace 5)
+    ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32)
     ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
-    ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
-    ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5)
-    ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C2]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s16) from unknown-address + 16, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p5) :: (load (s16) from unknown-address + 18, addrspace 5)
-    ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD4]]
-    ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C3]](s32)
-    ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s16) from unknown-address + 20, addrspace 5)
-    ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
-    ; GFX10-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s16) from unknown-address + 22, addrspace 5)
-    ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32)
-    ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[ZEXTLOAD5]]
-    ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR3]](s32), [[OR4]](s32), [[OR5]](s32)
+    ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 2, addrspace 5)
+    ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD2]], [[C]](s32)
+    ; GFX10-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p5) :: (load (s32) from unknown-address + 16, align 2, addrspace 5)
+    ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD2]], [[C1]](s32)
+    ; GFX10-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s32) from unknown-address + 20, align 2, addrspace 5)
+    ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD3]](s32), [[LOAD4]](s32), [[LOAD5]](s32)
     ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>)
     ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96)
     ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96)
diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
index ea10547da6ab7f..fd2d0bc901f3fc 100644
--- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -703,12 +703,12 @@ define <2 x i16> @chain_hi_to_lo_private_other_dep(ptr addrspace(5) %ptr) {
 ; FLATSCR-LABEL: chain_hi_to_lo_private_other_dep:
 ; FLATSCR:       ; %bb.0: ; %bb
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; FLATSCR-NEXT:    scratch_load_short_d16_hi v1, v0, off
+; FLATSCR-NEXT:    scratch_load_dword v0, v0, off
+; FLATSCR-NEXT:    s_mov_b32 s0, 0x7060302
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
 ; FLATSCR-NEXT:    v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0]
-; FLATSCR-NEXT:    scratch_load_short_d16 v1, v0, off offset:2
-; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; FLATSCR-NEXT:    v_mov_b32_e32 v0, v1
+; FLATSCR-NEXT:    v_perm_b32 v0, v1, v0, s0
 ; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10_DEFAULT-LABEL: chain_hi_to_lo_private_other_dep:
@@ -725,23 +725,22 @@ define <2 x i16> @chain_hi_to_lo_private_other_dep(ptr addrspace(5) %ptr) {
 ; FLATSCR_GFX10-LABEL: chain_hi_to_lo_private_other_dep:
 ; FLATSCR_GFX10:       ; %bb.0: ; %bb
 ; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; FLATSCR_GFX10-NEXT:    scratch_load_short_d16_hi v1, v0, off
+; FLATSCR_GFX10-NEXT:    scratch_load_dword v0, v0, off
 ; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR_GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
 ; FLATSCR_GFX10-NEXT:    v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0]
-; FLATSCR_GFX10-NEXT:    scratch_load_short_d16 v1, v0, off offset:2
-; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0)
-; FLATSCR_GFX10-NEXT:    v_mov_b32_e32 v0, v1
+; FLATSCR_GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
 ; FLATSCR_GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: chain_hi_to_lo_private_other_dep:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    scratch_load_d16_hi_b16 v1, v0, off
+; GFX11-NEXT:    scratch_load_b32 v0, v0, off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0]
-; GFX11-NEXT:    scratch_load_d16_b16 v1, v0, off offset:2
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v0, v1
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %gep_lo = getelementptr inbounds i16, ptr addrspace(5) %ptr, i64 1
diff --git a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll
index 0ad53083d0ff3f..12593e3760fd3e 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll
@@ -123,10 +123,8 @@ define amdgpu_kernel void @zextload_flat_i16(ptr addrspace(1) noalias %out, ptr
 }
 
 ; GCN-LABEL: flat_scratch_unaligned_load:
-; GCN: flat_load_{{ubyte|u8}}
-; GCN: flat_load_{{ubyte|u8}}
-; GCN: flat_load_{{ubyte|u8}}
-; GCN: flat_load_{{ubyte|u8}}
+; GFX9: flat_load_dword
+; GFX10PLUS: flat_load_{{dword|b32}}
 define amdgpu_kernel void @flat_scratch_unaligned_load() {
   %scratch = alloca i32, addrspace(5)
   %fptr = addrspacecast ptr addrspace(5) %scratch to ptr
@@ -136,10 +134,8 @@ define amdgpu_kernel void @flat_scratch_unaligned_load() {
 }
 
 ; GCN-LABEL: flat_scratch_unaligned_store:
-; GCN: flat_store_{{byte|b8}}
-; GCN: flat_store_{{byte|b8}}
-; GCN: flat_store_{{byte|b8}}
-; GCN: flat_store_{{byte|b8}}
+; GFX9: flat_store_dword
+; GFX10PLUS: flat_store_{{dword|b32}}
 define amdgpu_kernel void @flat_scratch_unaligned_store() {
   %scratch = alloca i32, addrspace(5)
   %fptr = addrspacecast ptr addrspace(5) %scratch to ptr
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll
index 1dd18b4228fe5e..9d43efbdf07b1f 100644
--- a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll
@@ -16,47 +16,18 @@ define void @issue63986(i64 %0, i64 %idxprom) {
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    v_mov_b32_e32 v9, s7
 ; CHECK-NEXT:    v_mov_b32_e32 v8, s6
-; CHECK-NEXT:    flat_load_ubyte v10, v[8:9] offset:5
-; CHECK-NEXT:    flat_load_ubyte v11, v[8:9] offset:6
-; CHECK-NEXT:    flat_load_ubyte v12, v[8:9] offset:7
-; CHECK-NEXT:    flat_load_ubyte v13, v[8:9] offset:3
-; CHECK-NEXT:    flat_load_ubyte v14, v[8:9] offset:2
-; CHECK-NEXT:    flat_load_ubyte v15, v[8:9] offset:1
-; CHECK-NEXT:    flat_load_ubyte v16, v[8:9]
-; CHECK-NEXT:    flat_load_ubyte v17, v[8:9] offset:4
-; CHECK-NEXT:    flat_load_ubyte v18, v[8:9] offset:13
-; CHECK-NEXT:    flat_load_ubyte v19, v[8:9] offset:14
-; CHECK-NEXT:    flat_load_ubyte v20, v[8:9] offset:15
-; CHECK-NEXT:    flat_load_ubyte v21, v[8:9] offset:11
-; CHECK-NEXT:    flat_load_ubyte v22, v[8:9] offset:10
-; CHECK-NEXT:    flat_load_ubyte v23, v[8:9] offset:9
-; CHECK-NEXT:    flat_load_ubyte v24, v[8:9] offset:8
-; CHECK-NEXT:    flat_load_ubyte v25, v[8:9] offset:12
+; CHECK-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
 ; CHECK-NEXT:    s_add_u32 s4, s4, 1
 ; CHECK-NEXT:    s_addc_u32 s5, s5, 0
-; CHECK-NEXT:    v_add_co_u32_e32 v8, vcc, s6, v6
+; CHECK-NEXT:    v_mov_b32_e32 v13, s7
+; CHECK-NEXT:    v_add_co_u32_e32 v12, vcc, s6, v6
 ; CHECK-NEXT:    v_cmp_ge_u64_e64 s[8:9], s[4:5], 2
-; CHECK-NEXT:    v_addc_co_u32_e32 v9, vcc, v9, v7, vcc
+; CHECK-NEXT:    v_addc_co_u32_e32 v13, vcc, v13, v7, vcc
 ; CHECK-NEXT:    s_add_u32 s6, s6, 16
 ; CHECK-NEXT:    s_addc_u32 s7, s7, 0
 ; CHECK-NEXT:    s_and_b64 vcc, exec, s[8:9]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[8:9], v13 offset:3
-; CHECK-NEXT:    flat_store_byte v[8:9], v14 offset:2
-; CHECK-NEXT:    flat_store_byte v[8:9], v15 offset:1
-; CHECK-NEXT:    flat_store_byte v[8:9], v16
-; CHECK-NEXT:    flat_store_byte v[8:9], v12 offset:7
-; CHECK-NEXT:    flat_store_byte v[8:9], v11 offset:6
-; CHECK-NEXT:    flat_store_byte v[8:9], v10 offset:5
-; CHECK-NEXT:    flat_store_byte v[8:9], v17 offset:4
-; CHECK-NEXT:    flat_store_byte v[8:9], v21 offset:11
-; CHECK-NEXT:    flat_store_byte v[8:9], v22 offset:10
-; CHECK-NEXT:    flat_store_byte v[8:9], v23 offset:9
-; CHECK-NEXT:    flat_store_byte v[8:9], v24 offset:8
-; CHECK-NEXT:    flat_store_byte v[8:9], v20 offset:15
-; CHECK-NEXT:    flat_store_byte v[8:9], v19 offset:14
-; CHECK-NEXT:    flat_store_byte v[8:9], v18 offset:13
-; CHECK-NEXT:    flat_store_byte v[8:9], v25 offset:12
+; CHECK-NEXT:    flat_store_dwordx4 v[12:13], v[8:11]
 ; CHECK-NEXT:    s_cbranch_vccz .LBB0_2
 ; CHECK-NEXT:  ; %bb.3: ; %loop-memcpy-residual-header
 ; CHECK-NEXT:    s_mov_b32 s4, 0
@@ -128,47 +99,18 @@ define void @issue63986(i64 %0, i64 %idxprom) {
 ; CHECK-NEXT:    ; => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    v_mov_b32_e32 v10, s10
 ; CHECK-NEXT:    v_mov_b32_e32 v11, s11
-; CHECK-NEXT:    flat_load_ubyte v12, v[10:11] offset:5
-; CHECK-NEXT:    flat_load_ubyte v13, v[10:11] offset:6
-; CHECK-NEXT:    flat_load_ubyte v14, v[10:11] offset:7
-; CHECK-NEXT:    flat_load_ubyte v15, v[10:11] offset:3
-; CHECK-NEXT:    flat_load_ubyte v16, v[10:11] offset:2
-; CHECK-NEXT:    flat_load_ubyte v17, v[10:11] offset:1
-; CHECK-NEXT:    flat_load_ubyte v18, v[10:11]
-; CHECK-NEXT:    flat_load_ubyte v19, v[10:11] offset:4
-; CHECK-NEXT:    flat_load_ubyte v20, v[10:11] offset:13
-; CHECK-NEXT:    flat_load_ubyte v21, v[10:11] offset:14
-; CHECK-NEXT:    flat_load_ubyte v22, v[10:11] offset:15
-; CHECK-NEXT:    flat_load_ubyte v23, v[10:11] offset:11
-; CHECK-NEXT:    flat_load_ubyte v24, v[10:11] offset:10
-; CHECK-NEXT:    flat_load_ubyte v25, v[10:11] offset:9
-; CHECK-NEXT:    flat_load_ubyte v26, v[10:11] offset:8
-; CHECK-NEXT:    flat_load_ubyte v27, v[10:11] offset:12
+; CHECK-NEXT:    flat_load_dwordx4 v[10:13], v[10:11]
+; CHECK-NEXT:    v_mov_b32_e32 v15, s11
 ; CHECK-NEXT:    s_add_u32 s14, s14, 1
-; CHECK-NEXT:    v_add_co_u32_e32 v10, vcc, s10, v2
-; CHECK-NEXT:    v_addc_co_u32_e32 v11, vcc, v11, v3, vcc
+; CHECK-NEXT:    v_add_co_u32_e32 v14, vcc, s10, v2
+; CHECK-NEXT:    v_addc_co_u32_e32 v15, vcc, v15, v3, vcc
 ; CHECK-NEXT:    s_addc_u32 s15, s15, 0
 ; CHECK-NEXT:    s_add_u32 s10, s10, 16
 ; CHECK-NEXT:    v_cmp_ge_u64_e32 vcc, s[14:15], v[4:5]
 ; CHECK-NEXT:    s_addc_u32 s11, s11, 0
 ; CHECK-NEXT:    s_or_b64 s[12:13], vcc, s[12:13]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[10:11], v15 offset:3
-; CHECK-NEXT:    flat_store_byte v[10:11], v16 offset:2
-; CHECK-NEXT:    flat_store_byte v[10:11], v17 offset:1
-; CHECK-NEXT:    flat_store_byte v[10:11], v18
-; CHECK-NEXT:    flat_store_byte v[10:11], v14 offset:7
-; CHECK-NEXT:    flat_store_byte v[10:11], v13 offset:6
-; CHECK-NEXT:    flat_store_byte v[10:11], v12 offset:5
-; CHECK-NEXT:    flat_store_byte v[10:11], v19 offset:4
-; CHECK-NEXT:    flat_store_byte v[10:11], v23 offset:11
-; CHECK-NEXT:    flat_store_byte v[10:11], v24 offset:10
-; CHECK-NEXT:    flat_store_byte v[10:11], v25 offset:9
-; CHECK-NEXT:    flat_store_byte v[10:11], v26 offset:8
-; CHECK-NEXT:    flat_store_byte v[10:11], v22 offset:15
-; CHECK-NEXT:    flat_store_byte v[10:11], v21 offset:14
-; CHECK-NEXT:    flat_store_byte v[10:11], v20 offset:13
-; CHECK-NEXT:    flat_store_byte v[10:11], v27 offset:12
+; CHECK-NEXT:    flat_store_dwordx4 v[14:15], v[10:13]
 ; CHECK-NEXT:    s_andn2_b64 exec, exec, s[12:13]
 ; CHECK-NEXT:    s_cbranch_execnz .LBB0_14
 ; CHECK-NEXT:  .LBB0_15: ; %Flow20
@@ -251,23 +193,11 @@ define void @issue63986_reduced_expanded(i64 %idxprom) {
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:  .LBB1_8: ; %post-loop-memcpy-expansion
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0
+; CHECK-NEXT:    v_mov_b32_e32 v3, v2
+; CHECK-NEXT:    v_mov_b32_e32 v4, v2
+; CHECK-NEXT:    v_mov_b32_e32 v5, v2
 ; CHECK-NEXT:    s_and_b64 vcc, exec, 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:3
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:2
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:1
-; CHECK-NEXT:    flat_store_byte v[0:1], v2
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:7
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:6
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:5
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:4
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:11
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:10
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:9
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:8
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:15
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:14
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:13
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:12
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:  .LBB1_9: ; %loop-memcpy-expansion2
 ; CHECK-NEXT:    s_mov_b64 vcc, vcc
 ; CHECK-NEXT:    s_cbranch_vccz .LBB1_9
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
index 0a76e169e9c385..8c28fac0d839c2 100644
--- a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
@@ -10,108 +10,21 @@ define amdgpu_kernel void @memcpy_p0_p0_minsize(ptr %dest, ptr readonly %src) #0
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-NEXT:    v_mov_b32_e32 v1, s3
-; CHECK-NEXT:    flat_load_ubyte v4, v[0:1]
-; CHECK-NEXT:    flat_load_ubyte v5, v[0:1] offset:1
-; CHECK-NEXT:    flat_load_ubyte v6, v[0:1] offset:2
-; CHECK-NEXT:    flat_load_ubyte v7, v[0:1] offset:3
-; CHECK-NEXT:    flat_load_ubyte v8, v[0:1] offset:4
-; CHECK-NEXT:    flat_load_ubyte v9, v[0:1] offset:5
-; CHECK-NEXT:    flat_load_ubyte v10, v[0:1] offset:6
-; CHECK-NEXT:    flat_load_ubyte v11, v[0:1] offset:7
-; CHECK-NEXT:    flat_load_ubyte v12, v[0:1] offset:8
-; CHECK-NEXT:    flat_load_ubyte v13, v[0:1] offset:9
-; CHECK-NEXT:    flat_load_ubyte v14, v[0:1] offset:10
-; CHECK-NEXT:    flat_load_ubyte v15, v[0:1] offset:11
-; CHECK-NEXT:    flat_load_ubyte v16, v[0:1] offset:12
-; CHECK-NEXT:    flat_load_ubyte v17, v[0:1] offset:13
-; CHECK-NEXT:    flat_load_ubyte v18, v[0:1] offset:14
-; CHECK-NEXT:    v_mov_b32_e32 v3, s1
-; CHECK-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-NEXT:    v_mov_b32_e32 v12, s3
+; CHECK-NEXT:    v_mov_b32_e32 v11, s2
+; CHECK-NEXT:    flat_load_ubyte v13, v[11:12] offset:46
+; CHECK-NEXT:    flat_load_ushort v14, v[11:12] offset:44
+; CHECK-NEXT:    flat_load_dwordx3 v[8:10], v[11:12] offset:32
+; CHECK-NEXT:    flat_load_dwordx4 v[0:3], v[11:12] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[11:12]
+; CHECK-NEXT:    v_mov_b32_e32 v12, s1
+; CHECK-NEXT:    v_mov_b32_e32 v11, s0
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[2:3], v4
-; CHECK-NEXT:    flat_store_byte v[2:3], v5 offset:1
-; CHECK-NEXT:    flat_store_byte v[2:3], v6 offset:2
-; CHECK-NEXT:    flat_store_byte v[2:3], v7 offset:3
-; CHECK-NEXT:    flat_store_byte v[2:3], v8 offset:4
-; CHECK-NEXT:    flat_store_byte v[2:3], v9 offset:5
-; CHECK-NEXT:    flat_store_byte v[2:3], v10 offset:6
-; CHECK-NEXT:    flat_store_byte v[2:3], v11 offset:7
-; CHECK-NEXT:    flat_store_byte v[2:3], v12 offset:8
-; CHECK-NEXT:    flat_store_byte v[2:3], v13 offset:9
-; CHECK-NEXT:    flat_store_byte v[2:3], v14 offset:10
-; CHECK-NEXT:    flat_store_byte v[2:3], v15 offset:11
-; CHECK-NEXT:    flat_store_byte v[2:3], v16 offset:12
-; CHECK-NEXT:    flat_store_byte v[2:3], v17 offset:13
-; CHECK-NEXT:    flat_store_byte v[2:3], v18 offset:14
-; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:30
-; CHECK-NEXT:    flat_load_ubyte v5, v[0:1] offset:29
-; CHECK-NEXT:    flat_load_ubyte v6, v[0:1] offset:28
-; CHECK-NEXT:    flat_load_ubyte v7, v[0:1] offset:27
-; CHECK-NEXT:    flat_load_ubyte v8, v[0:1] offset:26
-; CHECK-NEXT:    flat_load_ubyte v9, v[0:1] offset:25
-; CHECK-NEXT:    flat_load_ubyte v10, v[0:1] offset:24
-; CHECK-NEXT:    flat_load_ubyte v11, v[0:1] offset:23
-; CHECK-NEXT:    flat_load_ubyte v12, v[0:1] offset:22
-; CHECK-NEXT:    flat_load_ubyte v13, v[0:1] offset:21
-; CHECK-NEXT:    flat_load_ubyte v14, v[0:1] offset:20
-; CHECK-NEXT:    flat_load_ubyte v15, v[0:1] offset:19
-; CHECK-NEXT:    flat_load_ubyte v16, v[0:1] offset:18
-; CHECK-NEXT:    flat_load_ubyte v17, v[0:1] offset:17
-; CHECK-NEXT:    flat_load_ubyte v18, v[0:1] offset:16
-; CHECK-NEXT:    flat_load_ubyte v19, v[0:1] offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:30
-; CHECK-NEXT:    flat_store_byte v[2:3], v5 offset:29
-; CHECK-NEXT:    flat_store_byte v[2:3], v6 offset:28
-; CHECK-NEXT:    flat_store_byte v[2:3], v7 offset:27
-; CHECK-NEXT:    flat_store_byte v[2:3], v8 offset:26
-; CHECK-NEXT:    flat_store_byte v[2:3], v9 offset:25
-; CHECK-NEXT:    flat_store_byte v[2:3], v10 offset:24
-; CHECK-NEXT:    flat_store_byte v[2:3], v11 offset:23
-; CHECK-NEXT:    flat_store_byte v[2:3], v12 offset:22
-; CHECK-NEXT:    flat_store_byte v[2:3], v13 offset:21
-; CHECK-NEXT:    flat_store_byte v[2:3], v14 offset:20
-; CHECK-NEXT:    flat_store_byte v[2:3], v15 offset:19
-; CHECK-NEXT:    flat_store_byte v[2:3], v16 offset:18
-; CHECK-NEXT:    flat_store_byte v[2:3], v17 offset:17
-; CHECK-NEXT:    flat_store_byte v[2:3], v18 offset:16
-; CHECK-NEXT:    flat_store_byte v[2:3], v19 offset:15
-; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:46
-; CHECK-NEXT:    flat_load_ubyte v5, v[0:1] offset:45
-; CHECK-NEXT:    flat_load_ubyte v6, v[0:1] offset:44
-; CHECK-NEXT:    flat_load_ubyte v7, v[0:1] offset:43
-; CHECK-NEXT:    flat_load_ubyte v8, v[0:1] offset:42
-; CHECK-NEXT:    flat_load_ubyte v9, v[0:1] offset:41
-; CHECK-NEXT:    flat_load_ubyte v10, v[0:1] offset:40
-; CHECK-NEXT:    flat_load_ubyte v11, v[0:1] offset:39
-; CHECK-NEXT:    flat_load_ubyte v12, v[0:1] offset:38
-; CHECK-NEXT:    flat_load_ubyte v13, v[0:1] offset:37
-; CHECK-NEXT:    flat_load_ubyte v14, v[0:1] offset:36
-; CHECK-NEXT:    flat_load_ubyte v15, v[0:1] offset:35
-; CHECK-NEXT:    flat_load_ubyte v16, v[0:1] offset:34
-; CHECK-NEXT:    flat_load_ubyte v17, v[0:1] offset:33
-; CHECK-NEXT:    flat_load_ubyte v18, v[0:1] offset:32
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_load_ubyte v0, v[0:1] offset:31
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:46
-; CHECK-NEXT:    flat_store_byte v[2:3], v5 offset:45
-; CHECK-NEXT:    flat_store_byte v[2:3], v6 offset:44
-; CHECK-NEXT:    flat_store_byte v[2:3], v7 offset:43
-; CHECK-NEXT:    flat_store_byte v[2:3], v8 offset:42
-; CHECK-NEXT:    flat_store_byte v[2:3], v9 offset:41
-; CHECK-NEXT:    flat_store_byte v[2:3], v10 offset:40
-; CHECK-NEXT:    flat_store_byte v[2:3], v11 offset:39
-; CHECK-NEXT:    flat_store_byte v[2:3], v12 offset:38
-; CHECK-NEXT:    flat_store_byte v[2:3], v13 offset:37
-; CHECK-NEXT:    flat_store_byte v[2:3], v14 offset:36
-; CHECK-NEXT:    flat_store_byte v[2:3], v15 offset:35
-; CHECK-NEXT:    flat_store_byte v[2:3], v16 offset:34
-; CHECK-NEXT:    flat_store_byte v[2:3], v17 offset:33
-; CHECK-NEXT:    flat_store_byte v[2:3], v18 offset:32
-; CHECK-NEXT:    flat_store_byte v[2:3], v0 offset:31
+; CHECK-NEXT:    flat_store_byte v[11:12], v13 offset:46
+; CHECK-NEXT:    flat_store_short v[11:12], v14 offset:44
+; CHECK-NEXT:    flat_store_dwordx3 v[11:12], v[8:10] offset:32
+; CHECK-NEXT:    flat_store_dwordx4 v[11:12], v[0:3] offset:16
+; CHECK-NEXT:    flat_store_dwordx4 v[11:12], v[4:7]
 ; CHECK-NEXT:    s_endpgm
 entry:
   tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false)
@@ -185,375 +98,59 @@ define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr add
 ; CHECK-NEXT:    s_mov_b64 s[16:17], s[0:1]
 ; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x8
 ; CHECK-NEXT:    s_load_dword s2, s[6:7], 0x0
-; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    v_mov_b32_e32 v24, 0
 ; CHECK-NEXT:    s_add_u32 s16, s16, s13
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:15
-; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:14
-; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:13
-; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:12
-; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:11
-; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:10
-; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:9
-; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:8
-; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:7
-; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:6
-; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:5
-; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:4
-; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:3
-; CHECK-NEXT:    global_load_ubyte v15, v0, s[0:1] offset:2
-; CHECK-NEXT:    global_load_ubyte v16, v0, s[0:1] offset:1
-; CHECK-NEXT:    global_load_ubyte v17, v0, s[0:1]
-; CHECK-NEXT:    global_load_ubyte v18, v0, s[0:1] offset:31
-; CHECK-NEXT:    global_load_ubyte v19, v0, s[0:1] offset:30
+; CHECK-NEXT:    global_load_dwordx4 v[0:3], v24, s[0:1] offset:112
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v24, s[0:1] offset:96
+; CHECK-NEXT:    global_load_dwordx4 v[8:11], v24, s[0:1] offset:80
+; CHECK-NEXT:    global_load_dwordx4 v[12:15], v24, s[0:1] offset:64
+; CHECK-NEXT:    global_load_dwordx4 v[16:19], v24, s[0:1] offset:48
+; CHECK-NEXT:    global_load_dwordx4 v[20:23], v24, s[0:1] offset:32
 ; CHECK-NEXT:    s_addc_u32 s17, s17, 0
-; CHECK-NEXT:    v_mov_b32_e32 v1, s2
-; CHECK-NEXT:    global_load_ubyte v20, v0, s[0:1] offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v2, v1, s[16:19], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v3, v1, s[16:19], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v4, v1, s[16:19], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v5, v1, s[16:19], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v6, v1, s[16:19], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v7, v1, s[16:19], 0 offen offset:10
-; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:23
+; CHECK-NEXT:    v_mov_b32_e32 v25, s2
+; CHECK-NEXT:    s_waitcnt vmcnt(5)
+; CHECK-NEXT:    buffer_store_dword v3, v25, s[16:19], 0 offen offset:124
+; CHECK-NEXT:    buffer_store_dword v2, v25, s[16:19], 0 offen offset:120
+; CHECK-NEXT:    buffer_store_dword v1, v25, s[16:19], 0 offen offset:116
+; CHECK-NEXT:    buffer_store_dword v0, v25, s[16:19], 0 offen offset:112
+; CHECK-NEXT:    global_load_dwordx4 v[0:3], v24, s[0:1] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(9)
+; CHECK-NEXT:    buffer_store_dword v7, v25, s[16:19], 0 offen offset:108
+; CHECK-NEXT:    buffer_store_dword v6, v25, s[16:19], 0 offen offset:104
+; CHECK-NEXT:    buffer_store_dword v5, v25, s[16:19], 0 offen offset:100
+; CHECK-NEXT:    buffer_store_dword v4, v25, s[16:19], 0 offen offset:96
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v24, s[0:1]
+; CHECK-NEXT:    s_waitcnt vmcnt(13)
+; CHECK-NEXT:    buffer_store_dword v11, v25, s[16:19], 0 offen offset:92
+; CHECK-NEXT:    buffer_store_dword v10, v25, s[16:19], 0 offen offset:88
+; CHECK-NEXT:    buffer_store_dword v9, v25, s[16:19], 0 offen offset:84
+; CHECK-NEXT:    buffer_store_dword v8, v25, s[16:19], 0 offen offset:80
+; CHECK-NEXT:    s_waitcnt vmcnt(16)
+; CHECK-NEXT:    buffer_store_dword v15, v25, s[16:19], 0 offen offset:76
+; CHECK-NEXT:    buffer_store_dword v14, v25, s[16:19], 0 offen offset:72
+; CHECK-NEXT:    buffer_store_dword v13, v25, s[16:19], 0 offen offset:68
+; CHECK-NEXT:    buffer_store_dword v12, v25, s[16:19], 0 offen offset:64
 ; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    buffer_store_byte v8, v1, s[16:19], 0 offen offset:9
-; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    buffer_store_byte v9, v1, s[16:19], 0 offen offset:8
-; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(21)
-; CHECK-NEXT:    buffer_store_byte v10, v1, s[16:19], 0 offen offset:7
-; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:20
+; CHECK-NEXT:    buffer_store_dword v19, v25, s[16:19], 0 offen offset:60
+; CHECK-NEXT:    buffer_store_dword v18, v25, s[16:19], 0 offen offset:56
+; CHECK-NEXT:    buffer_store_dword v17, v25, s[16:19], 0 offen offset:52
+; CHECK-NEXT:    buffer_store_dword v16, v25, s[16:19], 0 offen offset:48
 ; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    buffer_store_byte v11, v1, s[16:19], 0 offen offset:6
-; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v12, v1, s[16:19], 0 offen offset:5
-; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    buffer_store_byte v15, v1, s[16:19], 0 offen offset:2
-; CHECK-NEXT:    global_load_ubyte v15, v0, s[0:1] offset:47
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v16, v1, s[16:19], 0 offen offset:1
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v17, v1, s[16:19], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v18, v1, s[16:19], 0 offen offset:31
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v19, v1, s[16:19], 0 offen offset:30
-; CHECK-NEXT:    buffer_store_byte v13, v1, s[16:19], 0 offen offset:4
-; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:17
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v14, v1, s[16:19], 0 offen offset:3
-; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:16
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:27
-; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:26
-; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:25
-; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:24
-; CHECK-NEXT:    global_load_ubyte v17, v0, s[0:1] offset:45
-; CHECK-NEXT:    global_load_ubyte v18, v0, s[0:1] offset:44
-; CHECK-NEXT:    global_load_ubyte v19, v0, s[0:1] offset:43
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v7, v1, s[16:19], 0 offen offset:23
-; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:36
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v8, v1, s[16:19], 0 offen offset:22
-; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:35
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v9, v1, s[16:19], 0 offen offset:21
-; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:34
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v10, v1, s[16:19], 0 offen offset:20
-; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:33
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v11, v1, s[16:19], 0 offen offset:19
-; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:32
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:28
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v20, v1, s[16:19], 0 offen offset:29
-; CHECK-NEXT:    global_load_ubyte v20, v0, s[0:1] offset:42
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v12, v1, s[16:19], 0 offen offset:18
-; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:63
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    buffer_store_byte v13, v1, s[16:19], 0 offen offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v14, v1, s[16:19], 0 offen offset:16
-; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:61
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    buffer_store_byte v3, v1, s[16:19], 0 offen offset:27
-; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:40
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v4, v1, s[16:19], 0 offen offset:26
-; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:39
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    buffer_store_byte v5, v1, s[16:19], 0 offen offset:25
-; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:38
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v6, v1, s[16:19], 0 offen offset:24
-; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:37
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v18, v1, s[16:19], 0 offen offset:44
-; CHECK-NEXT:    global_load_ubyte v18, v0, s[0:1] offset:57
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    buffer_store_byte v19, v1, s[16:19], 0 offen offset:43
-; CHECK-NEXT:    global_load_ubyte v19, v0, s[0:1] offset:56
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v17, v1, s[16:19], 0 offen offset:45
-; CHECK-NEXT:    global_load_ubyte v17, v0, s[0:1] offset:58
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v7, v1, s[16:19], 0 offen offset:36
-; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:49
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v8, v1, s[16:19], 0 offen offset:35
-; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:48
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    global_load_ubyte v16, v0, s[0:1] offset:46
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v15, v1, s[16:19], 0 offen offset:47
-; CHECK-NEXT:    global_load_ubyte v15, v0, s[0:1] offset:60
-; CHECK-NEXT:    s_waitcnt vmcnt(33)
-; CHECK-NEXT:    buffer_store_byte v9, v1, s[16:19], 0 offen offset:34
-; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:79
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v2, v1, s[16:19], 0 offen offset:28
-; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:41
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v20, v1, s[16:19], 0 offen offset:42
-; CHECK-NEXT:    global_load_ubyte v20, v0, s[0:1] offset:55
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v10, v1, s[16:19], 0 offen offset:33
-; CHECK-NEXT:    buffer_store_byte v11, v1, s[16:19], 0 offen offset:32
-; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:77
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v14, v1, s[16:19], 0 offen offset:61
-; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:74
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v3, v1, s[16:19], 0 offen offset:40
-; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:53
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v4, v1, s[16:19], 0 offen offset:39
-; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:52
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v5, v1, s[16:19], 0 offen offset:38
-; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:51
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v6, v1, s[16:19], 0 offen offset:37
-; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:50
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v18, v1, s[16:19], 0 offen offset:57
-; CHECK-NEXT:    global_load_ubyte v18, v0, s[0:1] offset:70
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v19, v1, s[16:19], 0 offen offset:56
-; CHECK-NEXT:    global_load_ubyte v19, v0, s[0:1] offset:69
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v17, v1, s[16:19], 0 offen offset:58
-; CHECK-NEXT:    global_load_ubyte v17, v0, s[0:1] offset:71
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v7, v1, s[16:19], 0 offen offset:49
-; CHECK-NEXT:    s_waitcnt vmcnt(29)
-; CHECK-NEXT:    buffer_store_byte v8, v1, s[16:19], 0 offen offset:48
-; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:93
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v16, v1, s[16:19], 0 offen offset:46
-; CHECK-NEXT:    global_load_ubyte v16, v0, s[0:1] offset:59
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v15, v1, s[16:19], 0 offen offset:60
-; CHECK-NEXT:    global_load_ubyte v15, v0, s[0:1] offset:73
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    buffer_store_byte v2, v1, s[16:19], 0 offen offset:41
-; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:54
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    buffer_store_byte v20, v1, s[16:19], 0 offen offset:55
-; CHECK-NEXT:    global_load_ubyte v20, v0, s[0:1] offset:68
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v14, v1, s[16:19], 0 offen offset:74
-; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:87
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v3, v1, s[16:19], 0 offen offset:53
-; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:66
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v4, v1, s[16:19], 0 offen offset:52
-; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:65
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v5, v1, s[16:19], 0 offen offset:51
-; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:64
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:62
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v12, v1, s[16:19], 0 offen offset:63
-; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:76
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    buffer_store_byte v6, v1, s[16:19], 0 offen offset:50
-; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:95
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v11, v1, s[16:19], 0 offen offset:77
-; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:90
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    buffer_store_byte v17, v1, s[16:19], 0 offen offset:71
-; CHECK-NEXT:    global_load_ubyte v17, v0, s[0:1] offset:83
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v18, v1, s[16:19], 0 offen offset:70
-; CHECK-NEXT:    buffer_store_byte v19, v1, s[16:19], 0 offen offset:69
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v16, v1, s[16:19], 0 offen offset:59
-; CHECK-NEXT:    global_load_ubyte v16, v0, s[0:1] offset:72
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v15, v1, s[16:19], 0 offen offset:73
-; CHECK-NEXT:    global_load_ubyte v15, v0, s[0:1] offset:85
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v2, v1, s[16:19], 0 offen offset:54
-; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:67
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v20, v1, s[16:19], 0 offen offset:68
-; CHECK-NEXT:    global_load_ubyte v20, v0, s[0:1] offset:81
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v3, v1, s[16:19], 0 offen offset:66
-; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:111
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v4, v1, s[16:19], 0 offen offset:65
-; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:110
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v5, v1, s[16:19], 0 offen offset:64
-; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:109
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    buffer_store_byte v13, v1, s[16:19], 0 offen offset:62
-; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:75
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    buffer_store_byte v12, v1, s[16:19], 0 offen offset:76
-; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:89
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    buffer_store_byte v11, v1, s[16:19], 0 offen offset:90
-; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:103
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v16, v1, s[16:19], 0 offen offset:72
-; CHECK-NEXT:    global_load_ubyte v16, v0, s[0:1] offset:86
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    global_load_ubyte v18, v0, s[0:1] offset:84
-; CHECK-NEXT:    global_load_ubyte v19, v0, s[0:1] offset:82
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v14, v1, s[16:19], 0 offen offset:87
-; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:100
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    buffer_store_byte v2, v1, s[16:19], 0 offen offset:67
-; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:80
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:78
-; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:94
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v9, v1, s[16:19], 0 offen offset:79
-; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:92
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v6, v1, s[16:19], 0 offen offset:95
-; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:108
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v8, v1, s[16:19], 0 offen offset:93
-; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:106
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    buffer_store_byte v13, v1, s[16:19], 0 offen offset:75
-; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:88
+; CHECK-NEXT:    buffer_store_dword v23, v25, s[16:19], 0 offen offset:44
+; CHECK-NEXT:    buffer_store_dword v22, v25, s[16:19], 0 offen offset:40
+; CHECK-NEXT:    buffer_store_dword v21, v25, s[16:19], 0 offen offset:36
+; CHECK-NEXT:    buffer_store_dword v20, v25, s[16:19], 0 offen offset:32
+; CHECK-NEXT:    s_waitcnt vmcnt(21)
+; CHECK-NEXT:    buffer_store_dword v3, v25, s[16:19], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v2, v25, s[16:19], 0 offen offset:24
+; CHECK-NEXT:    buffer_store_dword v1, v25, s[16:19], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v0, v25, s[16:19], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    buffer_store_byte v12, v1, s[16:19], 0 offen offset:89
-; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:102
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v10, v1, s[16:19], 0 offen offset:78
-; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:91
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v7, v1, s[16:19], 0 offen offset:94
-; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:107
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v9, v1, s[16:19], 0 offen offset:92
-; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:105
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v13, v1, s[16:19], 0 offen offset:88
-; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:101
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v10, v1, s[16:19], 0 offen offset:91
-; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:104
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v16, v1, s[16:19], 0 offen offset:86
-; CHECK-NEXT:    buffer_store_byte v15, v1, s[16:19], 0 offen offset:85
-; CHECK-NEXT:    buffer_store_byte v18, v1, s[16:19], 0 offen offset:84
-; CHECK-NEXT:    buffer_store_byte v17, v1, s[16:19], 0 offen offset:83
-; CHECK-NEXT:    buffer_store_byte v19, v1, s[16:19], 0 offen offset:82
-; CHECK-NEXT:    global_load_ubyte v15, v0, s[0:1] offset:96
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    global_load_ubyte v16, v0, s[0:1] offset:97
-; CHECK-NEXT:    global_load_ubyte v17, v0, s[0:1] offset:98
-; CHECK-NEXT:    global_load_ubyte v18, v0, s[0:1] offset:99
-; CHECK-NEXT:    global_load_ubyte v19, v0, s[0:1] offset:120
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v20, v1, s[16:19], 0 offen offset:81
-; CHECK-NEXT:    buffer_store_byte v2, v1, s[16:19], 0 offen offset:80
-; CHECK-NEXT:    buffer_store_byte v3, v1, s[16:19], 0 offen offset:111
-; CHECK-NEXT:    buffer_store_byte v4, v1, s[16:19], 0 offen offset:110
-; CHECK-NEXT:    buffer_store_byte v5, v1, s[16:19], 0 offen offset:109
-; CHECK-NEXT:    buffer_store_byte v6, v1, s[16:19], 0 offen offset:108
-; CHECK-NEXT:    buffer_store_byte v14, v1, s[16:19], 0 offen offset:100
-; CHECK-NEXT:    global_load_ubyte v20, v0, s[0:1] offset:121
-; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:122
-; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:123
-; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:124
-; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:125
-; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:126
-; CHECK-NEXT:    s_waitcnt vmcnt(29)
-; CHECK-NEXT:    buffer_store_byte v7, v1, s[16:19], 0 offen offset:107
-; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:127
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v8, v1, s[16:19], 0 offen offset:106
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v9, v1, s[16:19], 0 offen offset:105
-; CHECK-NEXT:    buffer_store_byte v11, v1, s[16:19], 0 offen offset:103
-; CHECK-NEXT:    buffer_store_byte v12, v1, s[16:19], 0 offen offset:102
-; CHECK-NEXT:    s_waitcnt vmcnt(31)
-; CHECK-NEXT:    buffer_store_byte v13, v1, s[16:19], 0 offen offset:101
-; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:116
-; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:117
-; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:119
-; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:114
-; CHECK-NEXT:    s_waitcnt vmcnt(34)
-; CHECK-NEXT:    buffer_store_byte v10, v1, s[16:19], 0 offen offset:104
-; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:118
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:115
-; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:113
-; CHECK-NEXT:    global_load_ubyte v21, v0, s[0:1] offset:112
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v18, v1, s[16:19], 0 offen offset:99
-; CHECK-NEXT:    buffer_store_byte v17, v1, s[16:19], 0 offen offset:98
-; CHECK-NEXT:    buffer_store_byte v16, v1, s[16:19], 0 offen offset:97
-; CHECK-NEXT:    buffer_store_byte v15, v1, s[16:19], 0 offen offset:96
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v7, v1, s[16:19], 0 offen offset:127
-; CHECK-NEXT:    buffer_store_byte v6, v1, s[16:19], 0 offen offset:126
-; CHECK-NEXT:    buffer_store_byte v5, v1, s[16:19], 0 offen offset:125
-; CHECK-NEXT:    buffer_store_byte v4, v1, s[16:19], 0 offen offset:124
-; CHECK-NEXT:    buffer_store_byte v3, v1, s[16:19], 0 offen offset:123
-; CHECK-NEXT:    buffer_store_byte v2, v1, s[16:19], 0 offen offset:122
-; CHECK-NEXT:    buffer_store_byte v20, v1, s[16:19], 0 offen offset:121
-; CHECK-NEXT:    buffer_store_byte v19, v1, s[16:19], 0 offen offset:120
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v11, v1, s[16:19], 0 offen offset:119
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    buffer_store_byte v10, v1, s[16:19], 0 offen offset:118
-; CHECK-NEXT:    buffer_store_byte v9, v1, s[16:19], 0 offen offset:117
-; CHECK-NEXT:    buffer_store_byte v8, v1, s[16:19], 0 offen offset:116
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v13, v1, s[16:19], 0 offen offset:115
-; CHECK-NEXT:    buffer_store_byte v12, v1, s[16:19], 0 offen offset:114
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    buffer_store_byte v14, v1, s[16:19], 0 offen offset:113
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    buffer_store_byte v21, v1, s[16:19], 0 offen offset:112
+; CHECK-NEXT:    buffer_store_dword v7, v25, s[16:19], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v6, v25, s[16:19], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v5, v25, s[16:19], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v4, v25, s[16:19], 0 offen
 ; CHECK-NEXT:    s_endpgm
 entry:
   tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false)
@@ -569,363 +166,57 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %
 ; CHECK-NEXT:    s_add_u32 s16, s16, s13
 ; CHECK-NEXT:    s_addc_u32 s17, s17, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v2, s0
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:11
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:2
+; CHECK-NEXT:    v_mov_b32_e32 v26, s0
+; CHECK-NEXT:    buffer_load_dword v3, v26, s[16:19], 0 offen offset:124
+; CHECK-NEXT:    buffer_load_dword v2, v26, s[16:19], 0 offen offset:120
+; CHECK-NEXT:    buffer_load_dword v1, v26, s[16:19], 0 offen offset:116
+; CHECK-NEXT:    buffer_load_dword v0, v26, s[16:19], 0 offen offset:112
+; CHECK-NEXT:    buffer_load_dword v7, v26, s[16:19], 0 offen offset:108
+; CHECK-NEXT:    buffer_load_dword v6, v26, s[16:19], 0 offen offset:104
+; CHECK-NEXT:    buffer_load_dword v5, v26, s[16:19], 0 offen offset:100
+; CHECK-NEXT:    buffer_load_dword v4, v26, s[16:19], 0 offen offset:96
 ; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:31
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v0, s0
-; CHECK-NEXT:    v_mov_b32_e32 v1, s1
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:14
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:13
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:12
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:11
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:10
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:9
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:23
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:8
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:22
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:7
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:21
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:6
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:20
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:5
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:19
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:4
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:18
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:3
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:17
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:1
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:47
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v18
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:31
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:2
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:16
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:45
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:23
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:37
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:22
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:36
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:21
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:35
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:20
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:34
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:19
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:33
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:18
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:32
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:29
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:30
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:44
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:17
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:63
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:16
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:28
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:42
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:26
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:40
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:25
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:39
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:24
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:38
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:27
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:41
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:45
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:59
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:37
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:51
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:36
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:50
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:35
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:49
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:34
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:48
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:46
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:47
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:61
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:29
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:43
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:44
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:58
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:33
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:79
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:32
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:42
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:56
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:40
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:54
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:39
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:53
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:38
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:52
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:41
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:55
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:59
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:73
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:51
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:65
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:50
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:64
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:62
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:63
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:77
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:46
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:60
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:61
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:75
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:43
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:57
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:58
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:72
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:49
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:95
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:48
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:56
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:70
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:54
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:68
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:53
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:67
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:52
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:66
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:55
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:69
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:73
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:87
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:65
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:111
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:64
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:110
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:62
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:76
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:77
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:91
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:60
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:74
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:75
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:89
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:57
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:71
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:72
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:86
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:70
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:84
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:68
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:83
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:67
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:81
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:66
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:80
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:78
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:79
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:93
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:69
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:82
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:87
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:101
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:76
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:90
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:91
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:105
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:74
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:88
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:89
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:103
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:71
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:85
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:86
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:100
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:78
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:92
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:93
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:107
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:90
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:104
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:88
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:102
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:85
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:99
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:94
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:95
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:109
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:92
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:106
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:94
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:108
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:84
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:83
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:82
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:81
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:96
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:97
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:98
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:120
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:80
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:111
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:110
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:109
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:99
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:121
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:122
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:123
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:124
+; CHECK-NEXT:    buffer_load_dword v8, v26, s[16:19], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v9, v26, s[16:19], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v10, v26, s[16:19], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v11, v26, s[16:19], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v12, v26, s[16:19], 0 offen offset:32
+; CHECK-NEXT:    buffer_load_dword v13, v26, s[16:19], 0 offen offset:36
+; CHECK-NEXT:    buffer_load_dword v14, v26, s[16:19], 0 offen offset:40
+; CHECK-NEXT:    buffer_load_dword v15, v26, s[16:19], 0 offen offset:44
+; CHECK-NEXT:    buffer_load_dword v16, v26, s[16:19], 0 offen offset:48
+; CHECK-NEXT:    buffer_load_dword v17, v26, s[16:19], 0 offen offset:52
+; CHECK-NEXT:    buffer_load_dword v18, v26, s[16:19], 0 offen offset:56
+; CHECK-NEXT:    buffer_load_dword v19, v26, s[16:19], 0 offen offset:60
+; CHECK-NEXT:    buffer_load_dword v23, v26, s[16:19], 0 offen offset:92
+; CHECK-NEXT:    buffer_load_dword v22, v26, s[16:19], 0 offen offset:88
+; CHECK-NEXT:    buffer_load_dword v21, v26, s[16:19], 0 offen offset:84
+; CHECK-NEXT:    buffer_load_dword v20, v26, s[16:19], 0 offen offset:80
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v25, s1
+; CHECK-NEXT:    v_mov_b32_e32 v24, s0
+; CHECK-NEXT:    s_waitcnt vmcnt(20)
+; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[0:3] offset:112
+; CHECK-NEXT:    buffer_load_dword v3, v26, s[16:19], 0 offen offset:76
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:107
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:105
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:104
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:103
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:106
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:102
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:101
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:100
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:126
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:116
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:117
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:118
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:119
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:127
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:114
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:115
+; CHECK-NEXT:    buffer_load_dword v2, v26, s[16:19], 0 offen offset:72
+; CHECK-NEXT:    buffer_load_dword v1, v26, s[16:19], 0 offen offset:68
+; CHECK-NEXT:    buffer_load_dword v0, v26, s[16:19], 0 offen offset:64
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:108
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:125
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:113
-; CHECK-NEXT:    buffer_load_ubyte v21, v2, s[16:19], 0 offen offset:112
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:98
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:97
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:96
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:127
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:126
+; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[4:7] offset:96
+; CHECK-NEXT:    buffer_load_dword v4, v26, s[16:19], 0 offen
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    buffer_load_dword v5, v26, s[16:19], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v6, v26, s[16:19], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v7, v26, s[16:19], 0 offen offset:12
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[20:23] offset:80
+; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[0:3] offset:64
+; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[16:19] offset:48
+; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[12:15] offset:32
+; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[8:11] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:125
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:124
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:123
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:122
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:121
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:120
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:119
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:118
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:117
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:116
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:115
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:114
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:113
-; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:112
+; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[4:7]
 ; CHECK-NEXT:    s_endpgm
 entry:
   tail call void @llvm.memcpy.p0.p5.i64(ptr %generic, ptr addrspace(5) %src, i64 128, i1 false)
@@ -972,279 +263,27 @@ define amdgpu_kernel void @memcpy_p0_p3_minsize(ptr %generic) #0 {
 ; CHECK-LABEL: memcpy_p0_p3_minsize:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-NEXT:    v_mov_b32_e32 v2, 0
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:112
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:113
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:114
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:115
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:116
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:117
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:118
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:119
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v0, s0
-; CHECK-NEXT:    v_mov_b32_e32 v1, s1
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:112
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:113
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:114
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:115
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:116
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:117
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:118
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:119
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:120
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:121
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:122
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:123
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:124
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:125
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:126
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:127
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:120
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:121
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:122
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:123
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:124
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:125
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:126
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:127
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:96
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:97
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:98
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:99
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:100
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:101
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:102
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:103
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:96
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:97
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:98
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:99
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:100
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:101
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:102
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:103
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:104
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:105
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:106
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:107
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:108
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:109
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:110
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:111
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:104
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:105
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:106
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:107
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:108
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:109
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:110
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:111
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:80
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:81
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:82
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:83
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:84
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:85
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:86
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:87
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:80
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:81
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:82
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:83
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:84
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:85
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:86
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:87
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:88
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:89
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:90
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:91
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:92
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:93
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:94
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:95
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:88
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:89
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:90
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:91
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:92
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:93
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:94
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:95
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:64
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:65
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:66
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:67
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:68
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:69
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:70
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:71
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:64
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:65
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:66
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:67
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:68
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:69
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:70
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:71
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:72
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:73
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:74
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:75
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:76
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:77
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:78
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:79
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:72
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:73
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:74
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:75
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:76
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:77
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:78
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:79
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:48
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:49
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:50
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:51
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:52
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:53
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:54
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:55
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:48
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:49
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:50
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:51
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:52
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:53
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:54
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:55
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:56
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:57
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:58
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:59
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:60
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:61
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:62
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:63
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:56
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:57
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:58
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:59
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:60
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:61
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:62
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:63
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:32
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:33
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:34
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:35
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:36
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:37
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:38
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:39
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:32
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:33
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:34
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:35
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:36
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:37
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:38
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:39
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:40
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:41
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:42
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:43
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:44
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:45
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:46
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:47
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:40
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:41
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:42
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:43
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:44
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:45
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:46
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:47
-; CHECK-NEXT:    ds_read_u8 v3, v2
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:1
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:2
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:3
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:4
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:5
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:6
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:7
-; CHECK-NEXT:    ds_read_u8 v11, v2 offset:8
-; CHECK-NEXT:    ds_read_u8 v12, v2 offset:9
-; CHECK-NEXT:    ds_read_u8 v13, v2 offset:10
-; CHECK-NEXT:    ds_read_u8 v14, v2 offset:11
-; CHECK-NEXT:    ds_read_u8 v15, v2 offset:12
-; CHECK-NEXT:    ds_read_u8 v16, v2 offset:13
-; CHECK-NEXT:    ds_read_u8 v17, v2 offset:14
-; CHECK-NEXT:    ds_read_u8 v18, v2 offset:15
-; CHECK-NEXT:    ds_read_u8 v19, v2 offset:16
-; CHECK-NEXT:    ds_read_u8 v20, v2 offset:17
-; CHECK-NEXT:    ds_read_u8 v21, v2 offset:18
-; CHECK-NEXT:    ds_read_u8 v22, v2 offset:19
-; CHECK-NEXT:    ds_read_u8 v23, v2 offset:20
-; CHECK-NEXT:    ds_read_u8 v24, v2 offset:21
-; CHECK-NEXT:    ds_read_u8 v25, v2 offset:22
-; CHECK-NEXT:    ds_read_u8 v26, v2 offset:23
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:16
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:17
-; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:18
-; CHECK-NEXT:    flat_store_byte v[0:1], v22 offset:19
-; CHECK-NEXT:    flat_store_byte v[0:1], v23 offset:20
-; CHECK-NEXT:    flat_store_byte v[0:1], v24 offset:21
-; CHECK-NEXT:    flat_store_byte v[0:1], v25 offset:22
-; CHECK-NEXT:    flat_store_byte v[0:1], v26 offset:23
-; CHECK-NEXT:    ds_read_u8 v19, v2 offset:24
-; CHECK-NEXT:    ds_read_u8 v20, v2 offset:25
-; CHECK-NEXT:    ds_read_u8 v21, v2 offset:26
-; CHECK-NEXT:    ds_read_u8 v22, v2 offset:27
-; CHECK-NEXT:    ds_read_u8 v23, v2 offset:28
-; CHECK-NEXT:    ds_read_u8 v24, v2 offset:29
-; CHECK-NEXT:    ds_read_u8 v25, v2 offset:30
-; CHECK-NEXT:    ds_read_u8 v2, v2 offset:31
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:24
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:25
-; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:26
-; CHECK-NEXT:    flat_store_byte v[0:1], v22 offset:27
-; CHECK-NEXT:    flat_store_byte v[0:1], v23 offset:28
-; CHECK-NEXT:    flat_store_byte v[0:1], v24 offset:29
-; CHECK-NEXT:    flat_store_byte v[0:1], v25 offset:30
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:31
-; CHECK-NEXT:    flat_store_byte v[0:1], v3
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:1
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:2
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:3
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:4
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:5
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:6
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:7
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:8
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:9
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:10
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:11
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:12
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:13
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:14
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:15
+; CHECK-NEXT:    v_mov_b32_e32 v16, 0
+; CHECK-NEXT:    ds_read2_b64 v[0:3], v16 offset1:1
+; CHECK-NEXT:    ds_read2_b64 v[4:7], v16 offset0:2 offset1:3
+; CHECK-NEXT:    ds_read2_b64 v[8:11], v16 offset0:4 offset1:5
+; CHECK-NEXT:    ds_read2_b64 v[12:15], v16 offset0:6 offset1:7
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v21, s1
+; CHECK-NEXT:    v_mov_b32_e32 v20, s0
+; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[0:3]
+; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[4:7] offset:16
+; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[8:11] offset:32
+; CHECK-NEXT:    ds_read2_b64 v[0:3], v16 offset0:8 offset1:9
+; CHECK-NEXT:    ds_read2_b64 v[4:7], v16 offset0:10 offset1:11
+; CHECK-NEXT:    ds_read2_b64 v[8:11], v16 offset0:12 offset1:13
+; CHECK-NEXT:    ds_read2_b64 v[16:19], v16 offset0:14 offset1:15
+; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[12:15] offset:48
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[0:3] offset:64
+; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[4:7] offset:80
+; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[8:11] offset:96
+; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[16:19] offset:112
 ; CHECK-NEXT:    s_endpgm
 entry:
   tail call void @llvm.memcpy.p0.p3.i64(ptr %generic, ptr addrspace(3) @shared, i64 128, i1 false)
@@ -1256,108 +295,21 @@ define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 {
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-NEXT:    v_mov_b32_e32 v1, s3
-; CHECK-NEXT:    flat_load_ubyte v4, v[0:1]
-; CHECK-NEXT:    flat_load_ubyte v5, v[0:1] offset:1
-; CHECK-NEXT:    flat_load_ubyte v6, v[0:1] offset:2
-; CHECK-NEXT:    flat_load_ubyte v7, v[0:1] offset:3
-; CHECK-NEXT:    flat_load_ubyte v8, v[0:1] offset:4
-; CHECK-NEXT:    flat_load_ubyte v9, v[0:1] offset:5
-; CHECK-NEXT:    flat_load_ubyte v10, v[0:1] offset:6
-; CHECK-NEXT:    flat_load_ubyte v11, v[0:1] offset:7
-; CHECK-NEXT:    flat_load_ubyte v12, v[0:1] offset:8
-; CHECK-NEXT:    flat_load_ubyte v13, v[0:1] offset:9
-; CHECK-NEXT:    flat_load_ubyte v14, v[0:1] offset:10
-; CHECK-NEXT:    flat_load_ubyte v15, v[0:1] offset:11
-; CHECK-NEXT:    flat_load_ubyte v16, v[0:1] offset:12
-; CHECK-NEXT:    flat_load_ubyte v17, v[0:1] offset:13
-; CHECK-NEXT:    flat_load_ubyte v18, v[0:1] offset:14
-; CHECK-NEXT:    v_mov_b32_e32 v3, s1
-; CHECK-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-NEXT:    v_mov_b32_e32 v12, s3
+; CHECK-NEXT:    v_mov_b32_e32 v11, s2
+; CHECK-NEXT:    flat_load_ubyte v13, v[11:12] offset:46
+; CHECK-NEXT:    flat_load_ushort v14, v[11:12] offset:44
+; CHECK-NEXT:    flat_load_dwordx3 v[8:10], v[11:12] offset:32
+; CHECK-NEXT:    flat_load_dwordx4 v[0:3], v[11:12] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[11:12]
+; CHECK-NEXT:    v_mov_b32_e32 v12, s1
+; CHECK-NEXT:    v_mov_b32_e32 v11, s0
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[2:3], v4
-; CHECK-NEXT:    flat_store_byte v[2:3], v5 offset:1
-; CHECK-NEXT:    flat_store_byte v[2:3], v6 offset:2
-; CHECK-NEXT:    flat_store_byte v[2:3], v7 offset:3
-; CHECK-NEXT:    flat_store_byte v[2:3], v8 offset:4
-; CHECK-NEXT:    flat_store_byte v[2:3], v9 offset:5
-; CHECK-NEXT:    flat_store_byte v[2:3], v10 offset:6
-; CHECK-NEXT:    flat_store_byte v[2:3], v11 offset:7
-; CHECK-NEXT:    flat_store_byte v[2:3], v12 offset:8
-; CHECK-NEXT:    flat_store_byte v[2:3], v13 offset:9
-; CHECK-NEXT:    flat_store_byte v[2:3], v14 offset:10
-; CHECK-NEXT:    flat_store_byte v[2:3], v15 offset:11
-; CHECK-NEXT:    flat_store_byte v[2:3], v16 offset:12
-; CHECK-NEXT:    flat_store_byte v[2:3], v17 offset:13
-; CHECK-NEXT:    flat_store_byte v[2:3], v18 offset:14
-; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:30
-; CHECK-NEXT:    flat_load_ubyte v5, v[0:1] offset:29
-; CHECK-NEXT:    flat_load_ubyte v6, v[0:1] offset:28
-; CHECK-NEXT:    flat_load_ubyte v7, v[0:1] offset:27
-; CHECK-NEXT:    flat_load_ubyte v8, v[0:1] offset:26
-; CHECK-NEXT:    flat_load_ubyte v9, v[0:1] offset:25
-; CHECK-NEXT:    flat_load_ubyte v10, v[0:1] offset:24
-; CHECK-NEXT:    flat_load_ubyte v11, v[0:1] offset:23
-; CHECK-NEXT:    flat_load_ubyte v12, v[0:1] offset:22
-; CHECK-NEXT:    flat_load_ubyte v13, v[0:1] offset:21
-; CHECK-NEXT:    flat_load_ubyte v14, v[0:1] offset:20
-; CHECK-NEXT:    flat_load_ubyte v15, v[0:1] offset:19
-; CHECK-NEXT:    flat_load_ubyte v16, v[0:1] offset:18
-; CHECK-NEXT:    flat_load_ubyte v17, v[0:1] offset:17
-; CHECK-NEXT:    flat_load_ubyte v18, v[0:1] offset:16
-; CHECK-NEXT:    flat_load_ubyte v19, v[0:1] offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:30
-; CHECK-NEXT:    flat_store_byte v[2:3], v5 offset:29
-; CHECK-NEXT:    flat_store_byte v[2:3], v6 offset:28
-; CHECK-NEXT:    flat_store_byte v[2:3], v7 offset:27
-; CHECK-NEXT:    flat_store_byte v[2:3], v8 offset:26
-; CHECK-NEXT:    flat_store_byte v[2:3], v9 offset:25
-; CHECK-NEXT:    flat_store_byte v[2:3], v10 offset:24
-; CHECK-NEXT:    flat_store_byte v[2:3], v11 offset:23
-; CHECK-NEXT:    flat_store_byte v[2:3], v12 offset:22
-; CHECK-NEXT:    flat_store_byte v[2:3], v13 offset:21
-; CHECK-NEXT:    flat_store_byte v[2:3], v14 offset:20
-; CHECK-NEXT:    flat_store_byte v[2:3], v15 offset:19
-; CHECK-NEXT:    flat_store_byte v[2:3], v16 offset:18
-; CHECK-NEXT:    flat_store_byte v[2:3], v17 offset:17
-; CHECK-NEXT:    flat_store_byte v[2:3], v18 offset:16
-; CHECK-NEXT:    flat_store_byte v[2:3], v19 offset:15
-; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:46
-; CHECK-NEXT:    flat_load_ubyte v5, v[0:1] offset:45
-; CHECK-NEXT:    flat_load_ubyte v6, v[0:1] offset:44
-; CHECK-NEXT:    flat_load_ubyte v7, v[0:1] offset:43
-; CHECK-NEXT:    flat_load_ubyte v8, v[0:1] offset:42
-; CHECK-NEXT:    flat_load_ubyte v9, v[0:1] offset:41
-; CHECK-NEXT:    flat_load_ubyte v10, v[0:1] offset:40
-; CHECK-NEXT:    flat_load_ubyte v11, v[0:1] offset:39
-; CHECK-NEXT:    flat_load_ubyte v12, v[0:1] offset:38
-; CHECK-NEXT:    flat_load_ubyte v13, v[0:1] offset:37
-; CHECK-NEXT:    flat_load_ubyte v14, v[0:1] offset:36
-; CHECK-NEXT:    flat_load_ubyte v15, v[0:1] offset:35
-; CHECK-NEXT:    flat_load_ubyte v16, v[0:1] offset:34
-; CHECK-NEXT:    flat_load_ubyte v17, v[0:1] offset:33
-; CHECK-NEXT:    flat_load_ubyte v18, v[0:1] offset:32
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_load_ubyte v0, v[0:1] offset:31
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:46
-; CHECK-NEXT:    flat_store_byte v[2:3], v5 offset:45
-; CHECK-NEXT:    flat_store_byte v[2:3], v6 offset:44
-; CHECK-NEXT:    flat_store_byte v[2:3], v7 offset:43
-; CHECK-NEXT:    flat_store_byte v[2:3], v8 offset:42
-; CHECK-NEXT:    flat_store_byte v[2:3], v9 offset:41
-; CHECK-NEXT:    flat_store_byte v[2:3], v10 offset:40
-; CHECK-NEXT:    flat_store_byte v[2:3], v11 offset:39
-; CHECK-NEXT:    flat_store_byte v[2:3], v12 offset:38
-; CHECK-NEXT:    flat_store_byte v[2:3], v13 offset:37
-; CHECK-NEXT:    flat_store_byte v[2:3], v14 offset:36
-; CHECK-NEXT:    flat_store_byte v[2:3], v15 offset:35
-; CHECK-NEXT:    flat_store_byte v[2:3], v16 offset:34
-; CHECK-NEXT:    flat_store_byte v[2:3], v17 offset:33
-; CHECK-NEXT:    flat_store_byte v[2:3], v18 offset:32
-; CHECK-NEXT:    flat_store_byte v[2:3], v0 offset:31
+; CHECK-NEXT:    flat_store_byte v[11:12], v13 offset:46
+; CHECK-NEXT:    flat_store_short v[11:12], v14 offset:44
+; CHECK-NEXT:    flat_store_dwordx3 v[11:12], v[8:10] offset:32
+; CHECK-NEXT:    flat_store_dwordx4 v[11:12], v[0:3] offset:16
+; CHECK-NEXT:    flat_store_dwordx4 v[11:12], v[4:7]
 ; CHECK-NEXT:    s_endpgm
 entry:
   tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false)
@@ -1431,375 +383,59 @@ define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr add
 ; CHECK-NEXT:    s_mov_b64 s[16:17], s[0:1]
 ; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x8
 ; CHECK-NEXT:    s_load_dword s2, s[6:7], 0x0
-; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    v_mov_b32_e32 v24, 0
 ; CHECK-NEXT:    s_add_u32 s16, s16, s13
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:15
-; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:14
-; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:13
-; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:12
-; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:11
-; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:10
-; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:9
-; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:8
-; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:7
-; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:6
-; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:5
-; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:4
-; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:3
-; CHECK-NEXT:    global_load_ubyte v15, v0, s[0:1] offset:2
-; CHECK-NEXT:    global_load_ubyte v16, v0, s[0:1] offset:1
-; CHECK-NEXT:    global_load_ubyte v17, v0, s[0:1]
-; CHECK-NEXT:    global_load_ubyte v18, v0, s[0:1] offset:31
-; CHECK-NEXT:    global_load_ubyte v19, v0, s[0:1] offset:30
+; CHECK-NEXT:    global_load_dwordx4 v[0:3], v24, s[0:1] offset:112
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v24, s[0:1] offset:96
+; CHECK-NEXT:    global_load_dwordx4 v[8:11], v24, s[0:1] offset:80
+; CHECK-NEXT:    global_load_dwordx4 v[12:15], v24, s[0:1] offset:64
+; CHECK-NEXT:    global_load_dwordx4 v[16:19], v24, s[0:1] offset:48
+; CHECK-NEXT:    global_load_dwordx4 v[20:23], v24, s[0:1] offset:32
 ; CHECK-NEXT:    s_addc_u32 s17, s17, 0
-; CHECK-NEXT:    v_mov_b32_e32 v1, s2
-; CHECK-NEXT:    global_load_ubyte v20, v0, s[0:1] offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v2, v1, s[16:19], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v3, v1, s[16:19], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v4, v1, s[16:19], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v5, v1, s[16:19], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v6, v1, s[16:19], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v7, v1, s[16:19], 0 offen offset:10
-; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:23
+; CHECK-NEXT:    v_mov_b32_e32 v25, s2
+; CHECK-NEXT:    s_waitcnt vmcnt(5)
+; CHECK-NEXT:    buffer_store_dword v3, v25, s[16:19], 0 offen offset:124
+; CHECK-NEXT:    buffer_store_dword v2, v25, s[16:19], 0 offen offset:120
+; CHECK-NEXT:    buffer_store_dword v1, v25, s[16:19], 0 offen offset:116
+; CHECK-NEXT:    buffer_store_dword v0, v25, s[16:19], 0 offen offset:112
+; CHECK-NEXT:    global_load_dwordx4 v[0:3], v24, s[0:1] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(9)
+; CHECK-NEXT:    buffer_store_dword v7, v25, s[16:19], 0 offen offset:108
+; CHECK-NEXT:    buffer_store_dword v6, v25, s[16:19], 0 offen offset:104
+; CHECK-NEXT:    buffer_store_dword v5, v25, s[16:19], 0 offen offset:100
+; CHECK-NEXT:    buffer_store_dword v4, v25, s[16:19], 0 offen offset:96
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v24, s[0:1]
+; CHECK-NEXT:    s_waitcnt vmcnt(13)
+; CHECK-NEXT:    buffer_store_dword v11, v25, s[16:19], 0 offen offset:92
+; CHECK-NEXT:    buffer_store_dword v10, v25, s[16:19], 0 offen offset:88
+; CHECK-NEXT:    buffer_store_dword v9, v25, s[16:19], 0 offen offset:84
+; CHECK-NEXT:    buffer_store_dword v8, v25, s[16:19], 0 offen offset:80
+; CHECK-NEXT:    s_waitcnt vmcnt(16)
+; CHECK-NEXT:    buffer_store_dword v15, v25, s[16:19], 0 offen offset:76
+; CHECK-NEXT:    buffer_store_dword v14, v25, s[16:19], 0 offen offset:72
+; CHECK-NEXT:    buffer_store_dword v13, v25, s[16:19], 0 offen offset:68
+; CHECK-NEXT:    buffer_store_dword v12, v25, s[16:19], 0 offen offset:64
 ; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    buffer_store_byte v8, v1, s[16:19], 0 offen offset:9
-; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    buffer_store_byte v9, v1, s[16:19], 0 offen offset:8
-; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(21)
-; CHECK-NEXT:    buffer_store_byte v10, v1, s[16:19], 0 offen offset:7
-; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:20
+; CHECK-NEXT:    buffer_store_dword v19, v25, s[16:19], 0 offen offset:60
+; CHECK-NEXT:    buffer_store_dword v18, v25, s[16:19], 0 offen offset:56
+; CHECK-NEXT:    buffer_store_dword v17, v25, s[16:19], 0 offen offset:52
+; CHECK-NEXT:    buffer_store_dword v16, v25, s[16:19], 0 offen offset:48
 ; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    buffer_store_byte v11, v1, s[16:19], 0 offen offset:6
-; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v12, v1, s[16:19], 0 offen offset:5
-; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    buffer_store_byte v15, v1, s[16:19], 0 offen offset:2
-; CHECK-NEXT:    global_load_ubyte v15, v0, s[0:1] offset:47
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v16, v1, s[16:19], 0 offen offset:1
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v17, v1, s[16:19], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v18, v1, s[16:19], 0 offen offset:31
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v19, v1, s[16:19], 0 offen offset:30
-; CHECK-NEXT:    buffer_store_byte v13, v1, s[16:19], 0 offen offset:4
-; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:17
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v14, v1, s[16:19], 0 offen offset:3
-; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:16
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:27
-; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:26
-; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:25
-; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:24
-; CHECK-NEXT:    global_load_ubyte v17, v0, s[0:1] offset:45
-; CHECK-NEXT:    global_load_ubyte v18, v0, s[0:1] offset:44
-; CHECK-NEXT:    global_load_ubyte v19, v0, s[0:1] offset:43
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v7, v1, s[16:19], 0 offen offset:23
-; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:36
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v8, v1, s[16:19], 0 offen offset:22
-; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:35
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v9, v1, s[16:19], 0 offen offset:21
-; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:34
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v10, v1, s[16:19], 0 offen offset:20
-; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:33
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v11, v1, s[16:19], 0 offen offset:19
-; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:32
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:28
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v20, v1, s[16:19], 0 offen offset:29
-; CHECK-NEXT:    global_load_ubyte v20, v0, s[0:1] offset:42
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v12, v1, s[16:19], 0 offen offset:18
-; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:63
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    buffer_store_byte v13, v1, s[16:19], 0 offen offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v14, v1, s[16:19], 0 offen offset:16
-; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:61
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    buffer_store_byte v3, v1, s[16:19], 0 offen offset:27
-; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:40
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v4, v1, s[16:19], 0 offen offset:26
-; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:39
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    buffer_store_byte v5, v1, s[16:19], 0 offen offset:25
-; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:38
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v6, v1, s[16:19], 0 offen offset:24
-; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:37
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v18, v1, s[16:19], 0 offen offset:44
-; CHECK-NEXT:    global_load_ubyte v18, v0, s[0:1] offset:57
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    buffer_store_byte v19, v1, s[16:19], 0 offen offset:43
-; CHECK-NEXT:    global_load_ubyte v19, v0, s[0:1] offset:56
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v17, v1, s[16:19], 0 offen offset:45
-; CHECK-NEXT:    global_load_ubyte v17, v0, s[0:1] offset:58
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v7, v1, s[16:19], 0 offen offset:36
-; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:49
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v8, v1, s[16:19], 0 offen offset:35
-; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:48
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    global_load_ubyte v16, v0, s[0:1] offset:46
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v15, v1, s[16:19], 0 offen offset:47
-; CHECK-NEXT:    global_load_ubyte v15, v0, s[0:1] offset:60
-; CHECK-NEXT:    s_waitcnt vmcnt(33)
-; CHECK-NEXT:    buffer_store_byte v9, v1, s[16:19], 0 offen offset:34
-; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:79
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v2, v1, s[16:19], 0 offen offset:28
-; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:41
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v20, v1, s[16:19], 0 offen offset:42
-; CHECK-NEXT:    global_load_ubyte v20, v0, s[0:1] offset:55
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v10, v1, s[16:19], 0 offen offset:33
-; CHECK-NEXT:    buffer_store_byte v11, v1, s[16:19], 0 offen offset:32
-; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:77
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v14, v1, s[16:19], 0 offen offset:61
-; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:74
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v3, v1, s[16:19], 0 offen offset:40
-; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:53
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v4, v1, s[16:19], 0 offen offset:39
-; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:52
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v5, v1, s[16:19], 0 offen offset:38
-; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:51
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v6, v1, s[16:19], 0 offen offset:37
-; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:50
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v18, v1, s[16:19], 0 offen offset:57
-; CHECK-NEXT:    global_load_ubyte v18, v0, s[0:1] offset:70
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v19, v1, s[16:19], 0 offen offset:56
-; CHECK-NEXT:    global_load_ubyte v19, v0, s[0:1] offset:69
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v17, v1, s[16:19], 0 offen offset:58
-; CHECK-NEXT:    global_load_ubyte v17, v0, s[0:1] offset:71
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v7, v1, s[16:19], 0 offen offset:49
-; CHECK-NEXT:    s_waitcnt vmcnt(29)
-; CHECK-NEXT:    buffer_store_byte v8, v1, s[16:19], 0 offen offset:48
-; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:93
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v16, v1, s[16:19], 0 offen offset:46
-; CHECK-NEXT:    global_load_ubyte v16, v0, s[0:1] offset:59
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v15, v1, s[16:19], 0 offen offset:60
-; CHECK-NEXT:    global_load_ubyte v15, v0, s[0:1] offset:73
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    buffer_store_byte v2, v1, s[16:19], 0 offen offset:41
-; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:54
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    buffer_store_byte v20, v1, s[16:19], 0 offen offset:55
-; CHECK-NEXT:    global_load_ubyte v20, v0, s[0:1] offset:68
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v14, v1, s[16:19], 0 offen offset:74
-; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:87
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v3, v1, s[16:19], 0 offen offset:53
-; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:66
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v4, v1, s[16:19], 0 offen offset:52
-; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:65
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v5, v1, s[16:19], 0 offen offset:51
-; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:64
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:62
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v12, v1, s[16:19], 0 offen offset:63
-; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:76
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    buffer_store_byte v6, v1, s[16:19], 0 offen offset:50
-; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:95
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v11, v1, s[16:19], 0 offen offset:77
-; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:90
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    buffer_store_byte v17, v1, s[16:19], 0 offen offset:71
-; CHECK-NEXT:    global_load_ubyte v17, v0, s[0:1] offset:83
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v18, v1, s[16:19], 0 offen offset:70
-; CHECK-NEXT:    buffer_store_byte v19, v1, s[16:19], 0 offen offset:69
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v16, v1, s[16:19], 0 offen offset:59
-; CHECK-NEXT:    global_load_ubyte v16, v0, s[0:1] offset:72
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v15, v1, s[16:19], 0 offen offset:73
-; CHECK-NEXT:    global_load_ubyte v15, v0, s[0:1] offset:85
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v2, v1, s[16:19], 0 offen offset:54
-; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:67
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v20, v1, s[16:19], 0 offen offset:68
-; CHECK-NEXT:    global_load_ubyte v20, v0, s[0:1] offset:81
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v3, v1, s[16:19], 0 offen offset:66
-; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:111
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v4, v1, s[16:19], 0 offen offset:65
-; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:110
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v5, v1, s[16:19], 0 offen offset:64
-; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:109
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    buffer_store_byte v13, v1, s[16:19], 0 offen offset:62
-; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:75
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    buffer_store_byte v12, v1, s[16:19], 0 offen offset:76
-; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:89
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    buffer_store_byte v11, v1, s[16:19], 0 offen offset:90
-; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:103
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v16, v1, s[16:19], 0 offen offset:72
-; CHECK-NEXT:    global_load_ubyte v16, v0, s[0:1] offset:86
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    global_load_ubyte v18, v0, s[0:1] offset:84
-; CHECK-NEXT:    global_load_ubyte v19, v0, s[0:1] offset:82
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v14, v1, s[16:19], 0 offen offset:87
-; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:100
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    buffer_store_byte v2, v1, s[16:19], 0 offen offset:67
-; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:80
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:78
-; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:94
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v9, v1, s[16:19], 0 offen offset:79
-; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:92
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v6, v1, s[16:19], 0 offen offset:95
-; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:108
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v8, v1, s[16:19], 0 offen offset:93
-; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:106
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    buffer_store_byte v13, v1, s[16:19], 0 offen offset:75
-; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:88
+; CHECK-NEXT:    buffer_store_dword v23, v25, s[16:19], 0 offen offset:44
+; CHECK-NEXT:    buffer_store_dword v22, v25, s[16:19], 0 offen offset:40
+; CHECK-NEXT:    buffer_store_dword v21, v25, s[16:19], 0 offen offset:36
+; CHECK-NEXT:    buffer_store_dword v20, v25, s[16:19], 0 offen offset:32
+; CHECK-NEXT:    s_waitcnt vmcnt(21)
+; CHECK-NEXT:    buffer_store_dword v3, v25, s[16:19], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v2, v25, s[16:19], 0 offen offset:24
+; CHECK-NEXT:    buffer_store_dword v1, v25, s[16:19], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v0, v25, s[16:19], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    buffer_store_byte v12, v1, s[16:19], 0 offen offset:89
-; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:102
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v10, v1, s[16:19], 0 offen offset:78
-; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:91
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v7, v1, s[16:19], 0 offen offset:94
-; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:107
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v9, v1, s[16:19], 0 offen offset:92
-; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:105
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v13, v1, s[16:19], 0 offen offset:88
-; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:101
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v10, v1, s[16:19], 0 offen offset:91
-; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:104
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v16, v1, s[16:19], 0 offen offset:86
-; CHECK-NEXT:    buffer_store_byte v15, v1, s[16:19], 0 offen offset:85
-; CHECK-NEXT:    buffer_store_byte v18, v1, s[16:19], 0 offen offset:84
-; CHECK-NEXT:    buffer_store_byte v17, v1, s[16:19], 0 offen offset:83
-; CHECK-NEXT:    buffer_store_byte v19, v1, s[16:19], 0 offen offset:82
-; CHECK-NEXT:    global_load_ubyte v15, v0, s[0:1] offset:96
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    global_load_ubyte v16, v0, s[0:1] offset:97
-; CHECK-NEXT:    global_load_ubyte v17, v0, s[0:1] offset:98
-; CHECK-NEXT:    global_load_ubyte v18, v0, s[0:1] offset:99
-; CHECK-NEXT:    global_load_ubyte v19, v0, s[0:1] offset:120
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v20, v1, s[16:19], 0 offen offset:81
-; CHECK-NEXT:    buffer_store_byte v2, v1, s[16:19], 0 offen offset:80
-; CHECK-NEXT:    buffer_store_byte v3, v1, s[16:19], 0 offen offset:111
-; CHECK-NEXT:    buffer_store_byte v4, v1, s[16:19], 0 offen offset:110
-; CHECK-NEXT:    buffer_store_byte v5, v1, s[16:19], 0 offen offset:109
-; CHECK-NEXT:    buffer_store_byte v6, v1, s[16:19], 0 offen offset:108
-; CHECK-NEXT:    buffer_store_byte v14, v1, s[16:19], 0 offen offset:100
-; CHECK-NEXT:    global_load_ubyte v20, v0, s[0:1] offset:121
-; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:122
-; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:123
-; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:124
-; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:125
-; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:126
-; CHECK-NEXT:    s_waitcnt vmcnt(29)
-; CHECK-NEXT:    buffer_store_byte v7, v1, s[16:19], 0 offen offset:107
-; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:127
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_store_byte v8, v1, s[16:19], 0 offen offset:106
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v9, v1, s[16:19], 0 offen offset:105
-; CHECK-NEXT:    buffer_store_byte v11, v1, s[16:19], 0 offen offset:103
-; CHECK-NEXT:    buffer_store_byte v12, v1, s[16:19], 0 offen offset:102
-; CHECK-NEXT:    s_waitcnt vmcnt(31)
-; CHECK-NEXT:    buffer_store_byte v13, v1, s[16:19], 0 offen offset:101
-; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:116
-; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:117
-; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:119
-; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:114
-; CHECK-NEXT:    s_waitcnt vmcnt(34)
-; CHECK-NEXT:    buffer_store_byte v10, v1, s[16:19], 0 offen offset:104
-; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:118
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:115
-; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:113
-; CHECK-NEXT:    global_load_ubyte v21, v0, s[0:1] offset:112
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v18, v1, s[16:19], 0 offen offset:99
-; CHECK-NEXT:    buffer_store_byte v17, v1, s[16:19], 0 offen offset:98
-; CHECK-NEXT:    buffer_store_byte v16, v1, s[16:19], 0 offen offset:97
-; CHECK-NEXT:    buffer_store_byte v15, v1, s[16:19], 0 offen offset:96
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v7, v1, s[16:19], 0 offen offset:127
-; CHECK-NEXT:    buffer_store_byte v6, v1, s[16:19], 0 offen offset:126
-; CHECK-NEXT:    buffer_store_byte v5, v1, s[16:19], 0 offen offset:125
-; CHECK-NEXT:    buffer_store_byte v4, v1, s[16:19], 0 offen offset:124
-; CHECK-NEXT:    buffer_store_byte v3, v1, s[16:19], 0 offen offset:123
-; CHECK-NEXT:    buffer_store_byte v2, v1, s[16:19], 0 offen offset:122
-; CHECK-NEXT:    buffer_store_byte v20, v1, s[16:19], 0 offen offset:121
-; CHECK-NEXT:    buffer_store_byte v19, v1, s[16:19], 0 offen offset:120
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v11, v1, s[16:19], 0 offen offset:119
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    buffer_store_byte v10, v1, s[16:19], 0 offen offset:118
-; CHECK-NEXT:    buffer_store_byte v9, v1, s[16:19], 0 offen offset:117
-; CHECK-NEXT:    buffer_store_byte v8, v1, s[16:19], 0 offen offset:116
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v13, v1, s[16:19], 0 offen offset:115
-; CHECK-NEXT:    buffer_store_byte v12, v1, s[16:19], 0 offen offset:114
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    buffer_store_byte v14, v1, s[16:19], 0 offen offset:113
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    buffer_store_byte v21, v1, s[16:19], 0 offen offset:112
+; CHECK-NEXT:    buffer_store_dword v7, v25, s[16:19], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v6, v25, s[16:19], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v5, v25, s[16:19], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v4, v25, s[16:19], 0 offen
 ; CHECK-NEXT:    s_endpgm
 entry:
   tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false)
@@ -1815,363 +451,57 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %
 ; CHECK-NEXT:    s_add_u32 s16, s16, s13
 ; CHECK-NEXT:    s_addc_u32 s17, s17, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v2, s0
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:11
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:2
+; CHECK-NEXT:    v_mov_b32_e32 v26, s0
+; CHECK-NEXT:    buffer_load_dword v3, v26, s[16:19], 0 offen offset:124
+; CHECK-NEXT:    buffer_load_dword v2, v26, s[16:19], 0 offen offset:120
+; CHECK-NEXT:    buffer_load_dword v1, v26, s[16:19], 0 offen offset:116
+; CHECK-NEXT:    buffer_load_dword v0, v26, s[16:19], 0 offen offset:112
+; CHECK-NEXT:    buffer_load_dword v7, v26, s[16:19], 0 offen offset:108
+; CHECK-NEXT:    buffer_load_dword v6, v26, s[16:19], 0 offen offset:104
+; CHECK-NEXT:    buffer_load_dword v5, v26, s[16:19], 0 offen offset:100
+; CHECK-NEXT:    buffer_load_dword v4, v26, s[16:19], 0 offen offset:96
 ; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:31
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v0, s0
-; CHECK-NEXT:    v_mov_b32_e32 v1, s1
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:14
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:13
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:12
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:11
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:10
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:9
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:23
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:8
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:22
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:7
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:21
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:6
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:20
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:5
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:19
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:4
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:18
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:3
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:17
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:1
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:47
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v18
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:31
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:2
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:16
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:45
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:23
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:37
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:22
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:36
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:21
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:35
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:20
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:34
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:19
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:33
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:18
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:32
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:29
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:30
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:44
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:17
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:63
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:16
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:28
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:42
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:26
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:40
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:25
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:39
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:24
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:38
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:27
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:41
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:45
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:59
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:37
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:51
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:36
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:50
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:35
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:49
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:34
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:48
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:46
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:47
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:61
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:29
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:43
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:44
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:58
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:33
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:79
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:32
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:42
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:56
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:40
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:54
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:39
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:53
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:38
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:52
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:41
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:55
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:59
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:73
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:51
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:65
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:50
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:64
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:62
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:63
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:77
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:46
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:60
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:61
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:75
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:43
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:57
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:58
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:72
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:49
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:95
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:48
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:56
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:70
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:54
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:68
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:53
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:67
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:52
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:66
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:55
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:69
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:73
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:87
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:65
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:111
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:64
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:110
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:62
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:76
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:77
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:91
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:60
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:74
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:75
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:89
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:57
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:71
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:72
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:86
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:70
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:84
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:68
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:83
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:67
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:81
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:66
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:80
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:78
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:79
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:93
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:69
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:82
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:87
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:101
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:76
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:90
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:91
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:105
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:74
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:88
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:89
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:103
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:71
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:85
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:86
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:100
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:78
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:92
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:93
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:107
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:90
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:104
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:88
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:102
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:85
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:99
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:94
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:95
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:109
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:92
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:106
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:94
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:108
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:84
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:83
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:82
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:81
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:96
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:97
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:98
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:120
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:80
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:111
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:110
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:109
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:99
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:121
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:122
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:123
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:124
+; CHECK-NEXT:    buffer_load_dword v8, v26, s[16:19], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v9, v26, s[16:19], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v10, v26, s[16:19], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v11, v26, s[16:19], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v12, v26, s[16:19], 0 offen offset:32
+; CHECK-NEXT:    buffer_load_dword v13, v26, s[16:19], 0 offen offset:36
+; CHECK-NEXT:    buffer_load_dword v14, v26, s[16:19], 0 offen offset:40
+; CHECK-NEXT:    buffer_load_dword v15, v26, s[16:19], 0 offen offset:44
+; CHECK-NEXT:    buffer_load_dword v16, v26, s[16:19], 0 offen offset:48
+; CHECK-NEXT:    buffer_load_dword v17, v26, s[16:19], 0 offen offset:52
+; CHECK-NEXT:    buffer_load_dword v18, v26, s[16:19], 0 offen offset:56
+; CHECK-NEXT:    buffer_load_dword v19, v26, s[16:19], 0 offen offset:60
+; CHECK-NEXT:    buffer_load_dword v23, v26, s[16:19], 0 offen offset:92
+; CHECK-NEXT:    buffer_load_dword v22, v26, s[16:19], 0 offen offset:88
+; CHECK-NEXT:    buffer_load_dword v21, v26, s[16:19], 0 offen offset:84
+; CHECK-NEXT:    buffer_load_dword v20, v26, s[16:19], 0 offen offset:80
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v25, s1
+; CHECK-NEXT:    v_mov_b32_e32 v24, s0
+; CHECK-NEXT:    s_waitcnt vmcnt(20)
+; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[0:3] offset:112
+; CHECK-NEXT:    buffer_load_dword v3, v26, s[16:19], 0 offen offset:76
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:107
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:105
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:104
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:103
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:106
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:102
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:101
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:100
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:126
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:116
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:117
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:118
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:119
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:127
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:114
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:115
+; CHECK-NEXT:    buffer_load_dword v2, v26, s[16:19], 0 offen offset:72
+; CHECK-NEXT:    buffer_load_dword v1, v26, s[16:19], 0 offen offset:68
+; CHECK-NEXT:    buffer_load_dword v0, v26, s[16:19], 0 offen offset:64
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:108
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:125
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:113
-; CHECK-NEXT:    buffer_load_ubyte v21, v2, s[16:19], 0 offen offset:112
-; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:98
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:97
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:96
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:127
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:126
+; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[4:7] offset:96
+; CHECK-NEXT:    buffer_load_dword v4, v26, s[16:19], 0 offen
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    buffer_load_dword v5, v26, s[16:19], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v6, v26, s[16:19], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v7, v26, s[16:19], 0 offen offset:12
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[20:23] offset:80
+; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[0:3] offset:64
+; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[16:19] offset:48
+; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[12:15] offset:32
+; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[8:11] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:125
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:124
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:123
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:122
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:121
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:120
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:119
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:118
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:117
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:116
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:115
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:114
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:113
-; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:112
+; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[4:7]
 ; CHECK-NEXT:    s_endpgm
 entry:
   tail call void @llvm.memcpy.p0.p5.i64(ptr %generic, ptr addrspace(5) %src, i64 128, i1 false)
@@ -2218,279 +548,27 @@ define amdgpu_kernel void @memcpy_p0_p3_optsize(ptr %generic) #1 {
 ; CHECK-LABEL: memcpy_p0_p3_optsize:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-NEXT:    v_mov_b32_e32 v2, 0
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:112
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:113
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:114
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:115
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:116
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:117
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:118
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:119
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v0, s0
-; CHECK-NEXT:    v_mov_b32_e32 v1, s1
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:112
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:113
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:114
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:115
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:116
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:117
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:118
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:119
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:120
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:121
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:122
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:123
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:124
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:125
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:126
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:127
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:120
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:121
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:122
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:123
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:124
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:125
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:126
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:127
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:96
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:97
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:98
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:99
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:100
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:101
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:102
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:103
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:96
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:97
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:98
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:99
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:100
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:101
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:102
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:103
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:104
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:105
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:106
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:107
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:108
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:109
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:110
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:111
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:104
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:105
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:106
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:107
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:108
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:109
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:110
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:111
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:80
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:81
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:82
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:83
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:84
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:85
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:86
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:87
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:80
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:81
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:82
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:83
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:84
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:85
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:86
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:87
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:88
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:89
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:90
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:91
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:92
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:93
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:94
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:95
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:88
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:89
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:90
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:91
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:92
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:93
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:94
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:95
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:64
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:65
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:66
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:67
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:68
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:69
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:70
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:71
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:64
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:65
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:66
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:67
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:68
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:69
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:70
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:71
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:72
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:73
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:74
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:75
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:76
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:77
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:78
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:79
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:72
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:73
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:74
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:75
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:76
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:77
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:78
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:79
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:48
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:49
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:50
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:51
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:52
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:53
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:54
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:55
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:48
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:49
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:50
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:51
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:52
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:53
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:54
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:55
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:56
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:57
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:58
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:59
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:60
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:61
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:62
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:63
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:56
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:57
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:58
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:59
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:60
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:61
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:62
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:63
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:32
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:33
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:34
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:35
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:36
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:37
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:38
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:39
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:32
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:33
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:34
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:35
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:36
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:37
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:38
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:39
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:40
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:41
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:42
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:43
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:44
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:45
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:46
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:47
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:40
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:41
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:42
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:43
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:44
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:45
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:46
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:47
-; CHECK-NEXT:    ds_read_u8 v3, v2
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:1
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:2
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:3
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:4
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:5
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:6
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:7
-; CHECK-NEXT:    ds_read_u8 v11, v2 offset:8
-; CHECK-NEXT:    ds_read_u8 v12, v2 offset:9
-; CHECK-NEXT:    ds_read_u8 v13, v2 offset:10
-; CHECK-NEXT:    ds_read_u8 v14, v2 offset:11
-; CHECK-NEXT:    ds_read_u8 v15, v2 offset:12
-; CHECK-NEXT:    ds_read_u8 v16, v2 offset:13
-; CHECK-NEXT:    ds_read_u8 v17, v2 offset:14
-; CHECK-NEXT:    ds_read_u8 v18, v2 offset:15
-; CHECK-NEXT:    ds_read_u8 v19, v2 offset:16
-; CHECK-NEXT:    ds_read_u8 v20, v2 offset:17
-; CHECK-NEXT:    ds_read_u8 v21, v2 offset:18
-; CHECK-NEXT:    ds_read_u8 v22, v2 offset:19
-; CHECK-NEXT:    ds_read_u8 v23, v2 offset:20
-; CHECK-NEXT:    ds_read_u8 v24, v2 offset:21
-; CHECK-NEXT:    ds_read_u8 v25, v2 offset:22
-; CHECK-NEXT:    ds_read_u8 v26, v2 offset:23
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:16
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:17
-; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:18
-; CHECK-NEXT:    flat_store_byte v[0:1], v22 offset:19
-; CHECK-NEXT:    flat_store_byte v[0:1], v23 offset:20
-; CHECK-NEXT:    flat_store_byte v[0:1], v24 offset:21
-; CHECK-NEXT:    flat_store_byte v[0:1], v25 offset:22
-; CHECK-NEXT:    flat_store_byte v[0:1], v26 offset:23
-; CHECK-NEXT:    ds_read_u8 v19, v2 offset:24
-; CHECK-NEXT:    ds_read_u8 v20, v2 offset:25
-; CHECK-NEXT:    ds_read_u8 v21, v2 offset:26
-; CHECK-NEXT:    ds_read_u8 v22, v2 offset:27
-; CHECK-NEXT:    ds_read_u8 v23, v2 offset:28
-; CHECK-NEXT:    ds_read_u8 v24, v2 offset:29
-; CHECK-NEXT:    ds_read_u8 v25, v2 offset:30
-; CHECK-NEXT:    ds_read_u8 v2, v2 offset:31
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:24
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:25
-; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:26
-; CHECK-NEXT:    flat_store_byte v[0:1], v22 offset:27
-; CHECK-NEXT:    flat_store_byte v[0:1], v23 offset:28
-; CHECK-NEXT:    flat_store_byte v[0:1], v24 offset:29
-; CHECK-NEXT:    flat_store_byte v[0:1], v25 offset:30
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:31
-; CHECK-NEXT:    flat_store_byte v[0:1], v3
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:1
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:2
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:3
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:4
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:5
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:6
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:7
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:8
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:9
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:10
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:11
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:12
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:13
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:14
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:15
+; CHECK-NEXT:    v_mov_b32_e32 v16, 0
+; CHECK-NEXT:    ds_read2_b64 v[0:3], v16 offset1:1
+; CHECK-NEXT:    ds_read2_b64 v[4:7], v16 offset0:2 offset1:3
+; CHECK-NEXT:    ds_read2_b64 v[8:11], v16 offset0:4 offset1:5
+; CHECK-NEXT:    ds_read2_b64 v[12:15], v16 offset0:6 offset1:7
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v21, s1
+; CHECK-NEXT:    v_mov_b32_e32 v20, s0
+; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[0:3]
+; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[4:7] offset:16
+; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[8:11] offset:32
+; CHECK-NEXT:    ds_read2_b64 v[0:3], v16 offset0:8 offset1:9
+; CHECK-NEXT:    ds_read2_b64 v[4:7], v16 offset0:10 offset1:11
+; CHECK-NEXT:    ds_read2_b64 v[8:11], v16 offset0:12 offset1:13
+; CHECK-NEXT:    ds_read2_b64 v[16:19], v16 offset0:14 offset1:15
+; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[12:15] offset:48
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[0:3] offset:64
+; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[4:7] offset:80
+; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[8:11] offset:96
+; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[16:19] offset:112
 ; CHECK-NEXT:    s_endpgm
 entry:
   tail call void @llvm.memcpy.p0.p3.i64(ptr %generic, ptr addrspace(3) @shared, i64 128, i1 false)
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll b/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll
index 7575782c1b2acd..cadc3dadb0a1e9 100644
--- a/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll
@@ -13,55 +13,9 @@ define void @memcpy_p0_p0_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p0_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:15
-; CHECK-NEXT:    flat_load_ubyte v5, v[2:3] offset:14
-; CHECK-NEXT:    flat_load_ubyte v6, v[2:3] offset:13
-; CHECK-NEXT:    flat_load_ubyte v7, v[2:3] offset:12
-; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:11
-; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:10
-; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:9
-; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:8
-; CHECK-NEXT:    flat_load_ubyte v12, v[2:3] offset:7
-; CHECK-NEXT:    flat_load_ubyte v13, v[2:3] offset:6
-; CHECK-NEXT:    flat_load_ubyte v14, v[2:3] offset:5
-; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:4
-; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:3
-; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:2
-; CHECK-NEXT:    flat_load_ubyte v18, v[2:3] offset:1
-; CHECK-NEXT:    flat_load_ubyte v2, v[2:3]
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:1
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2
+; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -73,101 +27,19 @@ define void @memcpy_p0_p0_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p0_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xe
-; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:14
-; CHECK-NEXT:    flat_load_ubyte v5, v[2:3] offset:13
-; CHECK-NEXT:    flat_load_ubyte v6, v[2:3] offset:12
-; CHECK-NEXT:    flat_load_ubyte v7, v[2:3] offset:11
-; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:10
-; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:9
-; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:8
-; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:7
-; CHECK-NEXT:    flat_load_ubyte v12, v[2:3] offset:6
-; CHECK-NEXT:    flat_load_ubyte v13, v[2:3] offset:5
-; CHECK-NEXT:    flat_load_ubyte v14, v[2:3] offset:4
-; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:3
-; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:2
-; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:1
-; CHECK-NEXT:    flat_load_ubyte v18, v[2:3]
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:1
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:30
-; CHECK-NEXT:    flat_load_ubyte v5, v[2:3] offset:29
-; CHECK-NEXT:    flat_load_ubyte v6, v[2:3] offset:28
-; CHECK-NEXT:    flat_load_ubyte v7, v[2:3] offset:27
-; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:26
-; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:25
-; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:24
-; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:23
-; CHECK-NEXT:    flat_load_ubyte v12, v[2:3] offset:22
-; CHECK-NEXT:    flat_load_ubyte v13, v[2:3] offset:21
-; CHECK-NEXT:    flat_load_ubyte v14, v[2:3] offset:20
-; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:19
-; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:18
-; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:17
-; CHECK-NEXT:    flat_load_ubyte v18, v[2:3] offset:16
-; CHECK-NEXT:    flat_load_ubyte v2, v[2:3] offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:15
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:30
+; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:28
+; CHECK-NEXT:    flat_load_dwordx3 v[6:8], v[2:3] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
+; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
+; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(3)
+; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(3)
+; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[6:8] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(3)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -179,104 +51,13 @@ define void @memcpy_p0_p0_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p0_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:15
-; CHECK-NEXT:    flat_load_ubyte v5, v[2:3] offset:14
-; CHECK-NEXT:    flat_load_ubyte v6, v[2:3] offset:13
-; CHECK-NEXT:    flat_load_ubyte v7, v[2:3] offset:12
-; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:11
-; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:10
-; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:9
-; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:8
-; CHECK-NEXT:    flat_load_ubyte v12, v[2:3] offset:7
-; CHECK-NEXT:    flat_load_ubyte v13, v[2:3] offset:6
-; CHECK-NEXT:    flat_load_ubyte v14, v[2:3] offset:5
-; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:4
-; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:3
-; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:2
-; CHECK-NEXT:    flat_load_ubyte v18, v[2:3] offset:1
-; CHECK-NEXT:    flat_load_ubyte v19, v[2:3]
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:1
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v19
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:31
-; CHECK-NEXT:    flat_load_ubyte v5, v[2:3] offset:30
-; CHECK-NEXT:    flat_load_ubyte v6, v[2:3] offset:29
-; CHECK-NEXT:    flat_load_ubyte v7, v[2:3] offset:28
-; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:27
-; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:26
-; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:25
-; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:24
-; CHECK-NEXT:    flat_load_ubyte v12, v[2:3] offset:23
-; CHECK-NEXT:    flat_load_ubyte v13, v[2:3] offset:22
-; CHECK-NEXT:    flat_load_ubyte v14, v[2:3] offset:21
-; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:20
-; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:19
-; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:18
-; CHECK-NEXT:    flat_load_ubyte v18, v[2:3] offset:17
-; CHECK-NEXT:    flat_load_ubyte v2, v[2:3] offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:31
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:16
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[2:3] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[8:11], v[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -288,31 +69,9 @@ define void @memcpy_p0_p0_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p0_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    flat_load_ushort v4, v[2:3] offset:14
-; CHECK-NEXT:    flat_load_ushort v5, v[2:3] offset:12
-; CHECK-NEXT:    flat_load_ushort v6, v[2:3] offset:10
-; CHECK-NEXT:    flat_load_ushort v7, v[2:3] offset:8
-; CHECK-NEXT:    flat_load_ushort v8, v[2:3] offset:6
-; CHECK-NEXT:    flat_load_ushort v9, v[2:3] offset:4
-; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:2
-; CHECK-NEXT:    flat_load_ushort v2, v[2:3]
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -324,55 +83,19 @@ define void @memcpy_p0_p0_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p0_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:30
-; CHECK-NEXT:    flat_load_ushort v5, v[2:3] offset:28
-; CHECK-NEXT:    flat_load_ushort v6, v[2:3] offset:26
-; CHECK-NEXT:    flat_load_ushort v7, v[2:3] offset:24
-; CHECK-NEXT:    flat_load_ushort v8, v[2:3] offset:22
-; CHECK-NEXT:    flat_load_ushort v9, v[2:3] offset:20
-; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:18
-; CHECK-NEXT:    flat_load_ushort v11, v[2:3] offset:16
-; CHECK-NEXT:    flat_load_ushort v12, v[2:3] offset:14
-; CHECK-NEXT:    flat_load_ushort v13, v[2:3] offset:12
-; CHECK-NEXT:    flat_load_ushort v14, v[2:3] offset:10
-; CHECK-NEXT:    flat_load_ushort v15, v[2:3] offset:8
-; CHECK-NEXT:    flat_load_ushort v16, v[2:3] offset:6
-; CHECK-NEXT:    flat_load_ushort v17, v[2:3] offset:4
-; CHECK-NEXT:    flat_load_ushort v18, v[2:3] offset:2
-; CHECK-NEXT:    flat_load_ushort v2, v[2:3]
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v12 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v13 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v14 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v15 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v16 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v17 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v18 offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:30
+; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:28
+; CHECK-NEXT:    flat_load_dwordx3 v[6:8], v[2:3] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
+; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
+; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(3)
+; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(3)
+; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[6:8] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(3)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -384,55 +107,13 @@ define void @memcpy_p0_p0_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p0_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ushort v4, v[2:3] offset:30
-; CHECK-NEXT:    flat_load_ushort v5, v[2:3] offset:28
-; CHECK-NEXT:    flat_load_ushort v6, v[2:3] offset:26
-; CHECK-NEXT:    flat_load_ushort v7, v[2:3] offset:24
-; CHECK-NEXT:    flat_load_ushort v8, v[2:3] offset:22
-; CHECK-NEXT:    flat_load_ushort v9, v[2:3] offset:20
-; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:18
-; CHECK-NEXT:    flat_load_ushort v11, v[2:3] offset:16
-; CHECK-NEXT:    flat_load_ushort v12, v[2:3] offset:14
-; CHECK-NEXT:    flat_load_ushort v13, v[2:3] offset:12
-; CHECK-NEXT:    flat_load_ushort v14, v[2:3] offset:10
-; CHECK-NEXT:    flat_load_ushort v15, v[2:3] offset:8
-; CHECK-NEXT:    flat_load_ushort v16, v[2:3] offset:6
-; CHECK-NEXT:    flat_load_ushort v17, v[2:3] offset:4
-; CHECK-NEXT:    flat_load_ushort v18, v[2:3] offset:2
-; CHECK-NEXT:    flat_load_ushort v2, v[2:3]
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v12 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v13 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v14 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v15 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v16 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v17 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v18 offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[2:3] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[8:11], v[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -458,58 +139,13 @@ define void @memcpy_p0_p0_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p0_sz31_align_8_8:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x10
-; CHECK-NEXT:    flat_load_ubyte v6, v[2:3] offset:17
-; CHECK-NEXT:    flat_load_ubyte v7, v[2:3] offset:18
-; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:15
-; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:16
-; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:21
-; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:22
-; CHECK-NEXT:    flat_load_ubyte v12, v[2:3] offset:19
-; CHECK-NEXT:    flat_load_ubyte v13, v[2:3] offset:20
-; CHECK-NEXT:    flat_load_ubyte v14, v[2:3] offset:25
-; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:26
-; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:23
-; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:24
-; CHECK-NEXT:    flat_load_ubyte v18, v[2:3] offset:29
-; CHECK-NEXT:    flat_load_ubyte v19, v[2:3] offset:30
-; CHECK-NEXT:    flat_load_ubyte v20, v[2:3] offset:27
-; CHECK-NEXT:    flat_load_ubyte v21, v[2:3] offset:28
-; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
-; CHECK-NEXT:    s_waitcnt vmcnt(16) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[2:3] offset:15
+; CHECK-NEXT:    flat_load_dwordx4 v[8:11], v[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7] offset:15
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -553,58 +189,13 @@ define void @memcpy_p0_p0_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr a
 ; CHECK-LABEL: memcpy_p0_p0_sz31_align_16_16:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x10
-; CHECK-NEXT:    flat_load_ubyte v6, v[2:3] offset:17
-; CHECK-NEXT:    flat_load_ubyte v7, v[2:3] offset:18
-; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:15
-; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:16
-; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:21
-; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:22
-; CHECK-NEXT:    flat_load_ubyte v12, v[2:3] offset:19
-; CHECK-NEXT:    flat_load_ubyte v13, v[2:3] offset:20
-; CHECK-NEXT:    flat_load_ubyte v14, v[2:3] offset:25
-; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:26
-; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:23
-; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:24
-; CHECK-NEXT:    flat_load_ubyte v18, v[2:3] offset:29
-; CHECK-NEXT:    flat_load_ubyte v19, v[2:3] offset:30
-; CHECK-NEXT:    flat_load_ubyte v20, v[2:3] offset:27
-; CHECK-NEXT:    flat_load_ubyte v21, v[2:3] offset:28
-; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
-; CHECK-NEXT:    s_waitcnt vmcnt(16) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(16)
-; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[2:3] offset:15
+; CHECK-NEXT:    flat_load_dwordx4 v[8:11], v[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7] offset:15
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -634,55 +225,9 @@ define void @memcpy_p0_p1_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p1_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:15
-; CHECK-NEXT:    global_load_ubyte v5, v[2:3], off offset:14
-; CHECK-NEXT:    global_load_ubyte v6, v[2:3], off offset:13
-; CHECK-NEXT:    global_load_ubyte v7, v[2:3], off offset:12
-; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:11
-; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:10
-; CHECK-NEXT:    global_load_ubyte v10, v[2:3], off offset:9
-; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:8
-; CHECK-NEXT:    global_load_ubyte v12, v[2:3], off offset:7
-; CHECK-NEXT:    global_load_ubyte v13, v[2:3], off offset:6
-; CHECK-NEXT:    global_load_ubyte v14, v[2:3], off offset:5
-; CHECK-NEXT:    global_load_ubyte v15, v[2:3], off offset:4
-; CHECK-NEXT:    global_load_ubyte v16, v[2:3], off offset:3
-; CHECK-NEXT:    global_load_ubyte v17, v[2:3], off offset:2
-; CHECK-NEXT:    global_load_ubyte v18, v[2:3], off offset:1
-; CHECK-NEXT:    global_load_ubyte v2, v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:1
+; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -694,101 +239,19 @@ define void @memcpy_p0_p1_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p1_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xe
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:14
-; CHECK-NEXT:    global_load_ubyte v5, v[2:3], off offset:13
-; CHECK-NEXT:    global_load_ubyte v6, v[2:3], off offset:12
-; CHECK-NEXT:    global_load_ubyte v7, v[2:3], off offset:11
-; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:10
-; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:9
-; CHECK-NEXT:    global_load_ubyte v10, v[2:3], off offset:8
-; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:7
-; CHECK-NEXT:    global_load_ubyte v12, v[2:3], off offset:6
-; CHECK-NEXT:    global_load_ubyte v13, v[2:3], off offset:5
-; CHECK-NEXT:    global_load_ubyte v14, v[2:3], off offset:4
-; CHECK-NEXT:    global_load_ubyte v15, v[2:3], off offset:3
-; CHECK-NEXT:    global_load_ubyte v16, v[2:3], off offset:2
-; CHECK-NEXT:    global_load_ubyte v17, v[2:3], off offset:1
-; CHECK-NEXT:    global_load_ubyte v18, v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:1
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:30
-; CHECK-NEXT:    global_load_ubyte v5, v[2:3], off offset:29
-; CHECK-NEXT:    global_load_ubyte v6, v[2:3], off offset:28
-; CHECK-NEXT:    global_load_ubyte v7, v[2:3], off offset:27
-; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:26
-; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:25
-; CHECK-NEXT:    global_load_ubyte v10, v[2:3], off offset:24
-; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:23
-; CHECK-NEXT:    global_load_ubyte v12, v[2:3], off offset:22
-; CHECK-NEXT:    global_load_ubyte v13, v[2:3], off offset:21
-; CHECK-NEXT:    global_load_ubyte v14, v[2:3], off offset:20
-; CHECK-NEXT:    global_load_ubyte v15, v[2:3], off offset:19
-; CHECK-NEXT:    global_load_ubyte v16, v[2:3], off offset:18
-; CHECK-NEXT:    global_load_ubyte v17, v[2:3], off offset:17
-; CHECK-NEXT:    global_load_ubyte v18, v[2:3], off offset:16
-; CHECK-NEXT:    global_load_ubyte v2, v[2:3], off offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:19
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:30
+; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:28
+; CHECK-NEXT:    global_load_dwordx3 v[6:8], v[2:3], off offset:16
+; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:18
+; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:17
+; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:16
+; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[6:8] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:15
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -800,104 +263,13 @@ define void @memcpy_p0_p1_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p1_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:15
-; CHECK-NEXT:    global_load_ubyte v5, v[2:3], off offset:14
-; CHECK-NEXT:    global_load_ubyte v6, v[2:3], off offset:13
-; CHECK-NEXT:    global_load_ubyte v7, v[2:3], off offset:12
-; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:11
-; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:10
-; CHECK-NEXT:    global_load_ubyte v10, v[2:3], off offset:9
-; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:8
-; CHECK-NEXT:    global_load_ubyte v12, v[2:3], off offset:7
-; CHECK-NEXT:    global_load_ubyte v13, v[2:3], off offset:6
-; CHECK-NEXT:    global_load_ubyte v14, v[2:3], off offset:5
-; CHECK-NEXT:    global_load_ubyte v15, v[2:3], off offset:4
-; CHECK-NEXT:    global_load_ubyte v16, v[2:3], off offset:3
-; CHECK-NEXT:    global_load_ubyte v17, v[2:3], off offset:2
-; CHECK-NEXT:    global_load_ubyte v18, v[2:3], off offset:1
-; CHECK-NEXT:    global_load_ubyte v19, v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:1
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v19
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:31
-; CHECK-NEXT:    global_load_ubyte v5, v[2:3], off offset:30
-; CHECK-NEXT:    global_load_ubyte v6, v[2:3], off offset:29
-; CHECK-NEXT:    global_load_ubyte v7, v[2:3], off offset:28
-; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:27
-; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:26
-; CHECK-NEXT:    global_load_ubyte v10, v[2:3], off offset:25
-; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:24
-; CHECK-NEXT:    global_load_ubyte v12, v[2:3], off offset:23
-; CHECK-NEXT:    global_load_ubyte v13, v[2:3], off offset:22
-; CHECK-NEXT:    global_load_ubyte v14, v[2:3], off offset:21
-; CHECK-NEXT:    global_load_ubyte v15, v[2:3], off offset:20
-; CHECK-NEXT:    global_load_ubyte v16, v[2:3], off offset:19
-; CHECK-NEXT:    global_load_ubyte v17, v[2:3], off offset:18
-; CHECK-NEXT:    global_load_ubyte v18, v[2:3], off offset:17
-; CHECK-NEXT:    global_load_ubyte v2, v[2:3], off offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:31
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:18
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:16
+; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:17
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:16
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -909,31 +281,9 @@ define void @memcpy_p0_p1_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p1_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    global_load_ushort v4, v[2:3], off offset:14
-; CHECK-NEXT:    global_load_ushort v5, v[2:3], off offset:12
-; CHECK-NEXT:    global_load_ushort v6, v[2:3], off offset:10
-; CHECK-NEXT:    global_load_ushort v7, v[2:3], off offset:8
-; CHECK-NEXT:    global_load_ushort v8, v[2:3], off offset:6
-; CHECK-NEXT:    global_load_ushort v9, v[2:3], off offset:4
-; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:2
-; CHECK-NEXT:    global_load_ushort v2, v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:2
+; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -945,55 +295,19 @@ define void @memcpy_p0_p1_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p1_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:30
-; CHECK-NEXT:    global_load_ushort v5, v[2:3], off offset:28
-; CHECK-NEXT:    global_load_ushort v6, v[2:3], off offset:26
-; CHECK-NEXT:    global_load_ushort v7, v[2:3], off offset:24
-; CHECK-NEXT:    global_load_ushort v8, v[2:3], off offset:22
-; CHECK-NEXT:    global_load_ushort v9, v[2:3], off offset:20
-; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:18
-; CHECK-NEXT:    global_load_ushort v11, v[2:3], off offset:16
-; CHECK-NEXT:    global_load_ushort v12, v[2:3], off offset:14
-; CHECK-NEXT:    global_load_ushort v13, v[2:3], off offset:12
-; CHECK-NEXT:    global_load_ushort v14, v[2:3], off offset:10
-; CHECK-NEXT:    global_load_ushort v15, v[2:3], off offset:8
-; CHECK-NEXT:    global_load_ushort v16, v[2:3], off offset:6
-; CHECK-NEXT:    global_load_ushort v17, v[2:3], off offset:4
-; CHECK-NEXT:    global_load_ushort v18, v[2:3], off offset:2
-; CHECK-NEXT:    global_load_ushort v2, v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v12 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_short v[0:1], v13 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_short v[0:1], v14 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_short v[0:1], v15 offset:8
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:30
+; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:28
+; CHECK-NEXT:    global_load_dwordx3 v[6:8], v[2:3], off offset:16
+; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_short v[0:1], v16 offset:6
+; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_short v[0:1], v17 offset:4
+; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_short v[0:1], v18 offset:2
+; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[6:8] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1005,55 +319,13 @@ define void @memcpy_p0_p1_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p1_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ushort v4, v[2:3], off offset:30
-; CHECK-NEXT:    global_load_ushort v5, v[2:3], off offset:28
-; CHECK-NEXT:    global_load_ushort v6, v[2:3], off offset:26
-; CHECK-NEXT:    global_load_ushort v7, v[2:3], off offset:24
-; CHECK-NEXT:    global_load_ushort v8, v[2:3], off offset:22
-; CHECK-NEXT:    global_load_ushort v9, v[2:3], off offset:20
-; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:18
-; CHECK-NEXT:    global_load_ushort v11, v[2:3], off offset:16
-; CHECK-NEXT:    global_load_ushort v12, v[2:3], off offset:14
-; CHECK-NEXT:    global_load_ushort v13, v[2:3], off offset:12
-; CHECK-NEXT:    global_load_ushort v14, v[2:3], off offset:10
-; CHECK-NEXT:    global_load_ushort v15, v[2:3], off offset:8
-; CHECK-NEXT:    global_load_ushort v16, v[2:3], off offset:6
-; CHECK-NEXT:    global_load_ushort v17, v[2:3], off offset:4
-; CHECK-NEXT:    global_load_ushort v18, v[2:3], off offset:2
-; CHECK-NEXT:    global_load_ushort v2, v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v12 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_short v[0:1], v13 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_short v[0:1], v14 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_short v[0:1], v15 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_short v[0:1], v16 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_short v[0:1], v17 offset:4
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:16
+; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_short v[0:1], v18 offset:2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1080,35 +352,12 @@ define void @memcpy_p0_p1_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addr
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_clause 0x1
-; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off
-; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off offset:15
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:15
+; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7] offset:15
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v8 offset:17
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:15
-; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v9 offset:21
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:19
-; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v10 offset:25
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:23
-; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v11 offset:29
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:27
-; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 24, v8
-; CHECK-NEXT:    v_lshrrev_b32_e32 v3, 8, v8
-; CHECK-NEXT:    v_lshrrev_b32_e32 v4, 24, v9
-; CHECK-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
-; CHECK-NEXT:    v_lshrrev_b32_e32 v6, 24, v10
-; CHECK-NEXT:    v_lshrrev_b32_e32 v7, 8, v10
-; CHECK-NEXT:    v_lshrrev_b32_e32 v8, 24, v11
-; CHECK-NEXT:    v_lshrrev_b32_e32 v9, 8, v11
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:18
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:16
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:22
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:20
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:26
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:24
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:30
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:28
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1153,35 +402,12 @@ define void @memcpy_p0_p1_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr a
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_clause 0x1
-; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off
-; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off offset:15
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:15
+; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7] offset:15
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v8 offset:17
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:15
-; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v9 offset:21
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:19
-; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v10 offset:25
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:23
-; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v11 offset:29
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:27
-; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 24, v8
-; CHECK-NEXT:    v_lshrrev_b32_e32 v3, 8, v8
-; CHECK-NEXT:    v_lshrrev_b32_e32 v4, 24, v9
-; CHECK-NEXT:    v_lshrrev_b32_e32 v5, 8, v9
-; CHECK-NEXT:    v_lshrrev_b32_e32 v6, 24, v10
-; CHECK-NEXT:    v_lshrrev_b32_e32 v7, 8, v10
-; CHECK-NEXT:    v_lshrrev_b32_e32 v8, 24, v11
-; CHECK-NEXT:    v_lshrrev_b32_e32 v9, 8, v11
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:18
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:16
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:22
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:20
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:26
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:24
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:30
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:28
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1211,54 +437,9 @@ define void @memcpy_p0_p3_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p3_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:15
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:14
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:13
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:12
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:11
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:10
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:9
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:8
-; CHECK-NEXT:    ds_read_u8 v11, v2 offset:7
-; CHECK-NEXT:    ds_read_u8 v12, v2 offset:6
-; CHECK-NEXT:    ds_read_u8 v13, v2 offset:5
-; CHECK-NEXT:    ds_read_u8 v14, v2 offset:4
-; CHECK-NEXT:    ds_read_u8 v15, v2 offset:3
-; CHECK-NEXT:    ds_read_u8 v16, v2 offset:2
-; CHECK-NEXT:    ds_read_u8 v17, v2 offset:1
-; CHECK-NEXT:    ds_read_u8 v2, v2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:15
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:14
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:13
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:12
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:11
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:10
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:9
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:8
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:7
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:6
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:5
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:4
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:3
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2
+; CHECK-NEXT:    ds_read2_b64 v[2:5], v2 offset1:1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1270,96 +451,19 @@ define void @memcpy_p0_p3_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p3_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:14
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:13
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:12
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:11
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:10
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:9
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:8
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:7
-; CHECK-NEXT:    ds_read_u8 v11, v2 offset:6
-; CHECK-NEXT:    ds_read_u8 v12, v2 offset:5
-; CHECK-NEXT:    ds_read_u8 v13, v2 offset:4
-; CHECK-NEXT:    ds_read_u8 v14, v2 offset:3
-; CHECK-NEXT:    ds_read_u8 v15, v2 offset:2
-; CHECK-NEXT:    ds_read_u8 v16, v2 offset:1
-; CHECK-NEXT:    ds_read_u8 v17, v2
-; CHECK-NEXT:    ds_read_u8 v18, v2 offset:15
-; CHECK-NEXT:    ds_read_u8 v19, v2 offset:16
-; CHECK-NEXT:    ds_read_u8 v20, v2 offset:17
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:14
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:13
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:12
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:11
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:10
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:9
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:8
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:7
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:6
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:5
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:4
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:3
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:30
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:29
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:28
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:27
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:26
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:25
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:24
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:23
-; CHECK-NEXT:    ds_read_u8 v11, v2 offset:22
-; CHECK-NEXT:    ds_read_u8 v12, v2 offset:21
-; CHECK-NEXT:    ds_read_u8 v13, v2 offset:20
-; CHECK-NEXT:    ds_read_u8 v14, v2 offset:19
-; CHECK-NEXT:    ds_read_u8 v2, v2 offset:18
-; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:30
-; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:29
-; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:28
-; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:27
-; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:26
-; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:25
-; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:24
-; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:23
-; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:22
-; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:21
-; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:20
-; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:19
-; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:18
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:17
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:16
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:15
+; CHECK-NEXT:    ds_read_u8 v9, v2 offset:30
+; CHECK-NEXT:    ds_read_b32 v8, v2 offset:24
+; CHECK-NEXT:    ds_read_u16 v10, v2 offset:28
+; CHECK-NEXT:    ds_read_b64 v[6:7], v2 offset:16
+; CHECK-NEXT:    ds_read2_b64 v[2:5], v2 offset1:1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
+; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
+; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
+; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
+; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[6:8] offset:16
+; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1371,100 +475,12 @@ define void @memcpy_p0_p3_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p3_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:15
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:14
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:13
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:12
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:11
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:10
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:9
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:8
-; CHECK-NEXT:    ds_read_u8 v11, v2 offset:7
-; CHECK-NEXT:    ds_read_u8 v12, v2 offset:6
-; CHECK-NEXT:    ds_read_u8 v13, v2 offset:5
-; CHECK-NEXT:    ds_read_u8 v14, v2 offset:4
-; CHECK-NEXT:    ds_read_u8 v15, v2 offset:3
-; CHECK-NEXT:    ds_read_u8 v16, v2 offset:2
-; CHECK-NEXT:    ds_read_u8 v17, v2 offset:1
-; CHECK-NEXT:    ds_read_u8 v18, v2
-; CHECK-NEXT:    ds_read_u8 v19, v2 offset:16
-; CHECK-NEXT:    ds_read_u8 v20, v2 offset:17
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:15
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:14
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:13
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:12
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:11
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:10
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:9
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:8
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:7
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:6
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:5
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:4
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:3
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:31
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:30
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:29
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:28
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:27
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:26
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:25
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:24
-; CHECK-NEXT:    ds_read_u8 v11, v2 offset:23
-; CHECK-NEXT:    ds_read_u8 v12, v2 offset:22
-; CHECK-NEXT:    ds_read_u8 v13, v2 offset:21
-; CHECK-NEXT:    ds_read_u8 v14, v2 offset:20
-; CHECK-NEXT:    ds_read_u8 v15, v2 offset:19
-; CHECK-NEXT:    ds_read_u8 v2, v2 offset:18
-; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:31
-; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:30
-; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:29
-; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:28
-; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:27
-; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:26
-; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:25
-; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:24
-; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:23
-; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:22
-; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:21
-; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:20
-; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:19
-; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:18
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:17
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:16
+; CHECK-NEXT:    ds_read2_b64 v[3:6], v2 offset0:2 offset1:3
+; CHECK-NEXT:    ds_read2_b64 v[7:10], v2 offset1:1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6] offset:16
+; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[7:10]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1476,30 +492,9 @@ define void @memcpy_p0_p3_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p3_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u16 v3, v2 offset:14
-; CHECK-NEXT:    ds_read_u16 v4, v2 offset:12
-; CHECK-NEXT:    ds_read_u16 v5, v2 offset:10
-; CHECK-NEXT:    ds_read_u16 v6, v2 offset:8
-; CHECK-NEXT:    ds_read_u16 v7, v2 offset:6
-; CHECK-NEXT:    ds_read_u16 v8, v2 offset:4
-; CHECK-NEXT:    ds_read_u16 v9, v2 offset:2
-; CHECK-NEXT:    ds_read_u16 v2, v2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v3 offset:14
-; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:12
-; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:10
-; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:8
-; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:6
-; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:4
-; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    ds_read2_b64 v[2:5], v2 offset1:1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1511,54 +506,19 @@ define void @memcpy_p0_p3_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p3_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:30
-; CHECK-NEXT:    ds_read_u16 v4, v2 offset:28
-; CHECK-NEXT:    ds_read_u16 v5, v2 offset:26
-; CHECK-NEXT:    ds_read_u16 v6, v2 offset:24
-; CHECK-NEXT:    ds_read_u16 v7, v2 offset:22
-; CHECK-NEXT:    ds_read_u16 v8, v2 offset:20
-; CHECK-NEXT:    ds_read_u16 v9, v2 offset:18
-; CHECK-NEXT:    ds_read_u16 v10, v2 offset:16
-; CHECK-NEXT:    ds_read_u16 v11, v2 offset:14
-; CHECK-NEXT:    ds_read_u16 v12, v2 offset:12
-; CHECK-NEXT:    ds_read_u16 v13, v2 offset:10
-; CHECK-NEXT:    ds_read_u16 v14, v2 offset:8
-; CHECK-NEXT:    ds_read_u16 v15, v2 offset:6
-; CHECK-NEXT:    ds_read_u16 v16, v2 offset:4
-; CHECK-NEXT:    ds_read_u16 v17, v2 offset:2
-; CHECK-NEXT:    ds_read_u16 v2, v2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:30
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:28
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:26
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:24
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:22
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:20
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:18
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:16
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:14
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v12 offset:12
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v13 offset:10
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v14 offset:8
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v15 offset:6
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v16 offset:4
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v17 offset:2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    ds_read_u8 v9, v2 offset:30
+; CHECK-NEXT:    ds_read_b32 v8, v2 offset:24
+; CHECK-NEXT:    ds_read_u16 v10, v2 offset:28
+; CHECK-NEXT:    ds_read_b64 v[6:7], v2 offset:16
+; CHECK-NEXT:    ds_read2_b64 v[2:5], v2 offset1:1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
+; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
+; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
+; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
+; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[6:8] offset:16
+; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1570,54 +530,12 @@ define void @memcpy_p0_p3_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p3_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u16 v3, v2 offset:30
-; CHECK-NEXT:    ds_read_u16 v4, v2 offset:28
-; CHECK-NEXT:    ds_read_u16 v5, v2 offset:26
-; CHECK-NEXT:    ds_read_u16 v6, v2 offset:24
-; CHECK-NEXT:    ds_read_u16 v7, v2 offset:22
-; CHECK-NEXT:    ds_read_u16 v8, v2 offset:20
-; CHECK-NEXT:    ds_read_u16 v9, v2 offset:18
-; CHECK-NEXT:    ds_read_u16 v10, v2 offset:16
-; CHECK-NEXT:    ds_read_u16 v11, v2 offset:14
-; CHECK-NEXT:    ds_read_u16 v12, v2 offset:12
-; CHECK-NEXT:    ds_read_u16 v13, v2 offset:10
-; CHECK-NEXT:    ds_read_u16 v14, v2 offset:8
-; CHECK-NEXT:    ds_read_u16 v15, v2 offset:6
-; CHECK-NEXT:    ds_read_u16 v16, v2 offset:4
-; CHECK-NEXT:    ds_read_u16 v17, v2 offset:2
-; CHECK-NEXT:    ds_read_u16 v2, v2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v3 offset:30
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:28
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:26
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:24
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:22
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:20
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:18
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:16
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:14
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v12 offset:12
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v13 offset:10
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v14 offset:8
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v15 offset:6
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v16 offset:4
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v17 offset:2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    ds_read2_b64 v[3:6], v2 offset0:2 offset1:3
+; CHECK-NEXT:    ds_read2_b64 v[7:10], v2 offset1:1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6] offset:16
+; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[7:10]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1643,35 +561,12 @@ define void @memcpy_p0_p3_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p3_sz31_align_8_8:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read2_b64 v[3:6], v2 offset1:1
-; CHECK-NEXT:    ds_read_b128 v[7:10], v2 offset:15
+; CHECK-NEXT:    ds_read_b128 v[3:6], v2 offset:15
+; CHECK-NEXT:    ds_read2_b64 v[7:10], v2 offset1:1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
-; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6] offset:15
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
-; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v7 offset:17
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:15
-; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v8 offset:21
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:19
-; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v9 offset:25
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:23
-; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v10 offset:29
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:27
-; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 24, v7
-; CHECK-NEXT:    v_lshrrev_b32_e32 v3, 8, v7
-; CHECK-NEXT:    v_lshrrev_b32_e32 v4, 24, v8
-; CHECK-NEXT:    v_lshrrev_b32_e32 v5, 8, v8
-; CHECK-NEXT:    v_lshrrev_b32_e32 v6, 24, v9
-; CHECK-NEXT:    v_lshrrev_b32_e32 v7, 8, v9
-; CHECK-NEXT:    v_lshrrev_b32_e32 v8, 24, v10
-; CHECK-NEXT:    v_lshrrev_b32_e32 v9, 8, v10
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:18
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:16
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:22
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:20
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:26
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:24
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:30
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:28
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[7:10]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1714,35 +609,12 @@ define void @memcpy_p0_p3_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr a
 ; CHECK-LABEL: memcpy_p0_p3_sz31_align_16_16:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_b128 v[3:6], v2
-; CHECK-NEXT:    ds_read_b128 v[7:10], v2 offset:15
+; CHECK-NEXT:    ds_read_b128 v[3:6], v2 offset:15
+; CHECK-NEXT:    ds_read_b128 v[7:10], v2
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
-; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6] offset:15
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
-; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v7 offset:17
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:15
-; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v8 offset:21
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:19
-; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v9 offset:25
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:23
-; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v10 offset:29
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:27
-; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 24, v7
-; CHECK-NEXT:    v_lshrrev_b32_e32 v3, 8, v7
-; CHECK-NEXT:    v_lshrrev_b32_e32 v4, 24, v8
-; CHECK-NEXT:    v_lshrrev_b32_e32 v5, 8, v8
-; CHECK-NEXT:    v_lshrrev_b32_e32 v6, 24, v9
-; CHECK-NEXT:    v_lshrrev_b32_e32 v7, 8, v9
-; CHECK-NEXT:    v_lshrrev_b32_e32 v8, 24, v10
-; CHECK-NEXT:    v_lshrrev_b32_e32 v9, 8, v10
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:18
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:16
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:22
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:20
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:26
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:24
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:30
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:28
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[7:10]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1771,55 +643,12 @@ define void @memcpy_p0_p4_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p4_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:15
-; CHECK-NEXT:    global_load_ubyte v5, v[2:3], off offset:14
-; CHECK-NEXT:    global_load_ubyte v6, v[2:3], off offset:13
-; CHECK-NEXT:    global_load_ubyte v7, v[2:3], off offset:12
-; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:11
-; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:10
-; CHECK-NEXT:    global_load_ubyte v10, v[2:3], off offset:9
-; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:8
-; CHECK-NEXT:    global_load_ubyte v12, v[2:3], off offset:7
-; CHECK-NEXT:    global_load_ubyte v13, v[2:3], off offset:6
-; CHECK-NEXT:    global_load_ubyte v14, v[2:3], off offset:5
-; CHECK-NEXT:    global_load_ubyte v15, v[2:3], off offset:4
-; CHECK-NEXT:    global_load_ubyte v16, v[2:3], off offset:3
-; CHECK-NEXT:    global_load_ubyte v17, v[2:3], off offset:2
-; CHECK-NEXT:    global_load_ubyte v18, v[2:3], off offset:1
-; CHECK-NEXT:    global_load_ubyte v2, v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:1
+; CHECK-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
+; CHECK-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[2:3] offset:8
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1831,100 +660,24 @@ define void @memcpy_p0_p4_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p4_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:1
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:1
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:2
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:3
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:4
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:5
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:6
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:7
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:8
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:9
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:10
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:11
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:12
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:13
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:14
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:30
-; CHECK-NEXT:    global_load_ubyte v5, v[2:3], off offset:29
-; CHECK-NEXT:    global_load_ubyte v6, v[2:3], off offset:28
-; CHECK-NEXT:    global_load_ubyte v7, v[2:3], off offset:27
-; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:26
-; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:25
-; CHECK-NEXT:    global_load_ubyte v10, v[2:3], off offset:24
-; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:23
-; CHECK-NEXT:    global_load_ubyte v12, v[2:3], off offset:22
-; CHECK-NEXT:    global_load_ubyte v13, v[2:3], off offset:21
-; CHECK-NEXT:    global_load_ubyte v14, v[2:3], off offset:20
-; CHECK-NEXT:    global_load_ubyte v15, v[2:3], off offset:19
-; CHECK-NEXT:    global_load_ubyte v16, v[2:3], off offset:18
-; CHECK-NEXT:    global_load_ubyte v17, v[2:3], off offset:17
-; CHECK-NEXT:    global_load_ubyte v18, v[2:3], off offset:16
-; CHECK-NEXT:    global_load_ubyte v2, v[2:3], off offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:16
+; CHECK-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
+; CHECK-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off offset:8
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[4:5] offset:8
+; CHECK-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[4:5] offset:16
+; CHECK-NEXT:    global_load_dword v4, v[2:3], off offset:24
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dword v[0:1], v4 offset:24
+; CHECK-NEXT:    global_load_ushort v4, v[2:3], off offset:28
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:28
+; CHECK-NEXT:    global_load_ubyte v2, v[2:3], off offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:15
+; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:30
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1936,104 +689,18 @@ define void @memcpy_p0_p4_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p4_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:15
-; CHECK-NEXT:    global_load_ubyte v5, v[2:3], off offset:14
-; CHECK-NEXT:    global_load_ubyte v6, v[2:3], off offset:13
-; CHECK-NEXT:    global_load_ubyte v7, v[2:3], off offset:12
-; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:11
-; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:10
-; CHECK-NEXT:    global_load_ubyte v10, v[2:3], off offset:9
-; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:8
-; CHECK-NEXT:    global_load_ubyte v12, v[2:3], off offset:7
-; CHECK-NEXT:    global_load_ubyte v13, v[2:3], off offset:6
-; CHECK-NEXT:    global_load_ubyte v14, v[2:3], off offset:5
-; CHECK-NEXT:    global_load_ubyte v15, v[2:3], off offset:4
-; CHECK-NEXT:    global_load_ubyte v16, v[2:3], off offset:3
-; CHECK-NEXT:    global_load_ubyte v17, v[2:3], off offset:2
-; CHECK-NEXT:    global_load_ubyte v18, v[2:3], off offset:1
-; CHECK-NEXT:    global_load_ubyte v19, v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:1
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v19
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:31
-; CHECK-NEXT:    global_load_ubyte v5, v[2:3], off offset:30
-; CHECK-NEXT:    global_load_ubyte v6, v[2:3], off offset:29
-; CHECK-NEXT:    global_load_ubyte v7, v[2:3], off offset:28
-; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:27
-; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:26
-; CHECK-NEXT:    global_load_ubyte v10, v[2:3], off offset:25
-; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:24
-; CHECK-NEXT:    global_load_ubyte v12, v[2:3], off offset:23
-; CHECK-NEXT:    global_load_ubyte v13, v[2:3], off offset:22
-; CHECK-NEXT:    global_load_ubyte v14, v[2:3], off offset:21
-; CHECK-NEXT:    global_load_ubyte v15, v[2:3], off offset:20
-; CHECK-NEXT:    global_load_ubyte v16, v[2:3], off offset:19
-; CHECK-NEXT:    global_load_ubyte v17, v[2:3], off offset:18
-; CHECK-NEXT:    global_load_ubyte v18, v[2:3], off offset:17
-; CHECK-NEXT:    global_load_ubyte v2, v[2:3], off offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:31
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:17
+; CHECK-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:16
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
+; CHECK-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off offset:8
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[4:5] offset:8
+; CHECK-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[4:5] offset:16
+; CHECK-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off offset:24
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[2:3] offset:24
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2045,30 +712,12 @@ define void @memcpy_p0_p4_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p4_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    global_load_ushort v4, v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v4
-; CHECK-NEXT:    global_load_ushort v4, v[2:3], off offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:2
-; CHECK-NEXT:    global_load_ushort v4, v[2:3], off offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:4
-; CHECK-NEXT:    global_load_ushort v4, v[2:3], off offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:6
-; CHECK-NEXT:    global_load_ushort v4, v[2:3], off offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:8
-; CHECK-NEXT:    global_load_ushort v4, v[2:3], off offset:10
+; CHECK-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:10
-; CHECK-NEXT:    global_load_ushort v4, v[2:3], off offset:12
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
+; CHECK-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:12
-; CHECK-NEXT:    global_load_ushort v2, v[2:3], off offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v2 offset:14
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[2:3] offset:8
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2080,55 +729,24 @@ define void @memcpy_p0_p4_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p4_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:30
-; CHECK-NEXT:    global_load_ushort v5, v[2:3], off offset:28
-; CHECK-NEXT:    global_load_ushort v6, v[2:3], off offset:26
-; CHECK-NEXT:    global_load_ushort v7, v[2:3], off offset:24
-; CHECK-NEXT:    global_load_ushort v8, v[2:3], off offset:22
-; CHECK-NEXT:    global_load_ushort v9, v[2:3], off offset:20
-; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:18
-; CHECK-NEXT:    global_load_ushort v11, v[2:3], off offset:16
-; CHECK-NEXT:    global_load_ushort v12, v[2:3], off offset:14
-; CHECK-NEXT:    global_load_ushort v13, v[2:3], off offset:12
-; CHECK-NEXT:    global_load_ushort v14, v[2:3], off offset:10
-; CHECK-NEXT:    global_load_ushort v15, v[2:3], off offset:8
-; CHECK-NEXT:    global_load_ushort v16, v[2:3], off offset:6
-; CHECK-NEXT:    global_load_ushort v17, v[2:3], off offset:4
-; CHECK-NEXT:    global_load_ushort v18, v[2:3], off offset:2
-; CHECK-NEXT:    global_load_ushort v2, v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v12 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_short v[0:1], v13 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_short v[0:1], v14 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_short v[0:1], v15 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_short v[0:1], v16 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_short v[0:1], v17 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_short v[0:1], v18 offset:2
+; CHECK-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
+; CHECK-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off offset:8
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[4:5] offset:8
+; CHECK-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[4:5] offset:16
+; CHECK-NEXT:    global_load_dword v4, v[2:3], off offset:24
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    flat_store_dword v[0:1], v4 offset:24
+; CHECK-NEXT:    global_load_ushort v4, v[2:3], off offset:28
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:28
+; CHECK-NEXT:    global_load_ubyte v2, v[2:3], off offset:30
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:30
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2140,55 +758,18 @@ define void @memcpy_p0_p4_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p4_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ushort v4, v[2:3], off offset:30
-; CHECK-NEXT:    global_load_ushort v5, v[2:3], off offset:28
-; CHECK-NEXT:    global_load_ushort v6, v[2:3], off offset:26
-; CHECK-NEXT:    global_load_ushort v7, v[2:3], off offset:24
-; CHECK-NEXT:    global_load_ushort v8, v[2:3], off offset:22
-; CHECK-NEXT:    global_load_ushort v9, v[2:3], off offset:20
-; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:18
-; CHECK-NEXT:    global_load_ushort v11, v[2:3], off offset:16
-; CHECK-NEXT:    global_load_ushort v12, v[2:3], off offset:14
-; CHECK-NEXT:    global_load_ushort v13, v[2:3], off offset:12
-; CHECK-NEXT:    global_load_ushort v14, v[2:3], off offset:10
-; CHECK-NEXT:    global_load_ushort v15, v[2:3], off offset:8
-; CHECK-NEXT:    global_load_ushort v16, v[2:3], off offset:6
-; CHECK-NEXT:    global_load_ushort v17, v[2:3], off offset:4
-; CHECK-NEXT:    global_load_ushort v18, v[2:3], off offset:2
-; CHECK-NEXT:    global_load_ushort v2, v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v12 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_short v[0:1], v13 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_short v[0:1], v14 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_short v[0:1], v15 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_short v[0:1], v16 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_short v[0:1], v17 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_short v[0:1], v18 offset:2
+; CHECK-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
+; CHECK-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[4:5] offset:8
+; CHECK-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[4:5] offset:16
+; CHECK-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off offset:24
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[2:3] offset:24
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2219,30 +800,7 @@ define void @memcpy_p0_p4_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addr
 ; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
 ; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off offset:15
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v2 offset:17
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:15
-; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v3 offset:21
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:19
-; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v4 offset:25
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:23
-; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v5 offset:29
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:27
-; CHECK-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
-; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
-; CHECK-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
-; CHECK-NEXT:    v_lshrrev_b32_e32 v3, 8, v3
-; CHECK-NEXT:    v_lshrrev_b32_e32 v8, 24, v4
-; CHECK-NEXT:    v_lshrrev_b32_e32 v4, 8, v4
-; CHECK-NEXT:    v_lshrrev_b32_e32 v9, 24, v5
-; CHECK-NEXT:    v_lshrrev_b32_e32 v5, 8, v5
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:18
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:16
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:22
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:20
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:26
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:24
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:28
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5] offset:15
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2290,30 +848,7 @@ define void @memcpy_p0_p4_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr a
 ; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
 ; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off offset:15
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v2 offset:17
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:15
-; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v3 offset:21
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:19
-; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v4 offset:25
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:23
-; CHECK-NEXT:    flat_store_byte_d16_hi v[0:1], v5 offset:29
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:27
-; CHECK-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
-; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
-; CHECK-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
-; CHECK-NEXT:    v_lshrrev_b32_e32 v3, 8, v3
-; CHECK-NEXT:    v_lshrrev_b32_e32 v8, 24, v4
-; CHECK-NEXT:    v_lshrrev_b32_e32 v4, 8, v4
-; CHECK-NEXT:    v_lshrrev_b32_e32 v9, 24, v5
-; CHECK-NEXT:    v_lshrrev_b32_e32 v5, 8, v5
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:18
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:16
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:22
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:20
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:26
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:24
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:28
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5] offset:15
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2342,55 +877,13 @@ define void @memcpy_p0_p5_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p5_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v2, v2, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:1
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2402,99 +895,23 @@ define void @memcpy_p0_p5_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p5_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x11
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:1
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17
-; CHECK-NEXT:    s_clause 0xc
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:17
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:24
+; CHECK-NEXT:    s_clause 0x8
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:23
+; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:19
+; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:30
+; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[7:9] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:18
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:15
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2506,103 +923,19 @@ define void @memcpy_p0_p5_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p5_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x11
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:1
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18
-; CHECK-NEXT:    s_clause 0xd
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:31
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:31
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:23
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:19
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:18
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:16
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[7:10]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2614,31 +947,13 @@ define void @memcpy_p0_p5_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p5_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    buffer_load_ushort v3, v2, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ushort v4, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v5, v2, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ushort v6, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ushort v7, v2, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ushort v8, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ushort v9, v2, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ushort v2, v2, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v3 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:2
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2650,55 +965,23 @@ define void @memcpy_p0_p5_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p5_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ushort v4, v2, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ushort v5, v2, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ushort v6, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ushort v7, v2, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ushort v8, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ushort v9, v2, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ushort v10, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ushort v11, v2, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ushort v12, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v13, v2, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ushort v14, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ushort v15, v2, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ushort v16, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ushort v17, v2, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ushort v2, v2, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_short v[0:1], v12 offset:12
+; CHECK-NEXT:    s_clause 0x8
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_short v[0:1], v13 offset:10
+; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_short v[0:1], v14 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_short v[0:1], v15 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_short v[0:1], v16 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_short v[0:1], v17 offset:2
+; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:30
+; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[7:9] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2710,55 +993,19 @@ define void @memcpy_p0_p5_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p5_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    buffer_load_ushort v3, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ushort v4, v2, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ushort v5, v2, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ushort v6, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ushort v7, v2, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ushort v8, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ushort v9, v2, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ushort v10, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ushort v11, v2, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ushort v12, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v13, v2, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ushort v14, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ushort v15, v2, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ushort v16, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ushort v17, v2, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ushort v2, v2, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v3 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_short v[0:1], v12 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_short v[0:1], v13 offset:10
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_short v[0:1], v14 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_short v[0:1], v15 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_short v[0:1], v16 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_short v[0:1], v17 offset:2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[7:10]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2788,53 +1035,19 @@ define void @memcpy_p0_p5_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p5_sz31_align_8_8:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x13
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    s_clause 0x7
 ; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
 ; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:18
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:15
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:22
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:19
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:26
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:23
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:29
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:30
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:27
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:15
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:27
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v22 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[7:10] offset:15
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2888,53 +1101,19 @@ define void @memcpy_p0_p5_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr a
 ; CHECK-LABEL: memcpy_p0_p5_sz31_align_16_16:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x13
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    s_clause 0x7
 ; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
 ; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:18
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:15
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:22
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:19
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:26
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:23
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:29
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:30
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:27
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:15
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:27
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v22 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[7:10] offset:15
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2970,41 +1149,8 @@ define void @memcpy_p1_p0_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p1_p0_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:5
-; CHECK-NEXT:    flat_load_ubyte v5, v[2:3] offset:6
-; CHECK-NEXT:    flat_load_ubyte v6, v[2:3] offset:7
-; CHECK-NEXT:    flat_load_ubyte v7, v[2:3]
-; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:1
-; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:2
-; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:4
-; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:3
-; CHECK-NEXT:    flat_load_ubyte v12, v[2:3] offset:15
-; CHECK-NEXT:    flat_load_ubyte v13, v[2:3] offset:14
-; CHECK-NEXT:    flat_load_ubyte v14, v[2:3] offset:13
-; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:12
-; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:9
-; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:8
-; CHECK-NEXT:    flat_load_ubyte v18, v[2:3] offset:11
-; CHECK-NEXT:    flat_load_ubyte v2, v[2:3] offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v4, 8, v10
-; CHECK-NEXT:    v_lshl_or_b32 v4, v6, 8, v5
-; CHECK-NEXT:    v_lshl_or_b32 v5, v8, 8, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v11, 8, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v12, 8, v13
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v14, 8, v15
-; CHECK-NEXT:    v_lshl_or_b32 v3, v4, 16, v3
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v16, 8, v17
+; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v18, 8, v2
-; CHECK-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
-; CHECK-NEXT:    v_lshl_or_b32 v5, v8, 16, v7
-; CHECK-NEXT:    v_lshl_or_b32 v4, v10, 16, v9
 ; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -3016,79 +1162,15 @@ define void @memcpy_p1_p0_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p1_p0_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1e
-; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:28
-; CHECK-NEXT:    flat_load_ubyte v5, v[2:3] offset:29
-; CHECK-NEXT:    flat_load_ubyte v6, v[2:3] offset:30
-; CHECK-NEXT:    flat_load_ubyte v7, v[2:3] offset:24
-; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:25
-; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:27
-; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:26
-; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:15
-; CHECK-NEXT:    flat_load_ubyte v12, v[2:3] offset:14
-; CHECK-NEXT:    flat_load_ubyte v13, v[2:3] offset:13
-; CHECK-NEXT:    flat_load_ubyte v14, v[2:3] offset:12
-; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:23
-; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:22
-; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:21
-; CHECK-NEXT:    flat_load_ubyte v18, v[2:3] offset:20
-; CHECK-NEXT:    flat_load_ubyte v19, v[2:3] offset:19
-; CHECK-NEXT:    flat_load_ubyte v20, v[2:3] offset:18
-; CHECK-NEXT:    flat_load_ubyte v21, v[2:3] offset:17
-; CHECK-NEXT:    flat_load_ubyte v22, v[2:3] offset:16
-; CHECK-NEXT:    flat_load_ubyte v23, v[2:3] offset:11
-; CHECK-NEXT:    flat_load_ubyte v24, v[2:3] offset:10
-; CHECK-NEXT:    flat_load_ubyte v25, v[2:3] offset:9
-; CHECK-NEXT:    flat_load_ubyte v26, v[2:3] offset:8
-; CHECK-NEXT:    flat_load_ubyte v27, v[2:3] offset:7
-; CHECK-NEXT:    flat_load_ubyte v28, v[2:3] offset:6
-; CHECK-NEXT:    flat_load_ubyte v29, v[2:3] offset:5
-; CHECK-NEXT:    flat_load_ubyte v30, v[2:3] offset:4
-; CHECK-NEXT:    flat_load_ubyte v31, v[2:3] offset:1
-; CHECK-NEXT:    flat_load_ubyte v32, v[2:3]
-; CHECK-NEXT:    flat_load_ubyte v33, v[2:3] offset:3
-; CHECK-NEXT:    flat_load_ubyte v2, v[2:3] offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(25) lgkmcnt(25)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v4, 8, v9
-; CHECK-NEXT:    v_lshl_or_b32 v4, v6, 8, v5
-; CHECK-NEXT:    s_waitcnt vmcnt(24) lgkmcnt(24)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v10, 8, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(22) lgkmcnt(22)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v11, 8, v12
-; CHECK-NEXT:    s_waitcnt vmcnt(20) lgkmcnt(20)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v13, 8, v14
-; CHECK-NEXT:    s_waitcnt vmcnt(19) lgkmcnt(19)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v7, 8, v15
-; CHECK-NEXT:    s_waitcnt vmcnt(18) lgkmcnt(18)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v15, 8, v16
-; CHECK-NEXT:    v_lshl_or_b32 v7, v4, 16, v3
-; CHECK-NEXT:    s_waitcnt vmcnt(16) lgkmcnt(16)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v17, 8, v18
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v15, v19, 8, v20
-; CHECK-NEXT:    v_lshl_or_b32 v6, v6, 16, v5
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v21, 8, v22
-; CHECK-NEXT:    v_lshl_or_b32 v9, v9, 16, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v23, 8, v24
-; CHECK-NEXT:    v_lshl_or_b32 v5, v11, 16, v13
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v25, 8, v26
-; CHECK-NEXT:    v_lshl_or_b32 v8, v15, 16, v10
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v17, v27, 8, v28
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v16, v29, 8, v30
-; CHECK-NEXT:    v_lshl_or_b32 v4, v14, 16, v12
+; CHECK-NEXT:    s_clause 0x2
+; CHECK-NEXT:    flat_load_dwordx2 v[6:7], v[2:3] offset:23
+; CHECK-NEXT:    flat_load_dwordx2 v[8:9], v[2:3] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
 ; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v18, v31, 8, v32
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v33, 8, v2
-; CHECK-NEXT:    v_lshl_or_b32 v3, v17, 16, v16
-; CHECK-NEXT:    v_lshl_or_b32 v2, v2, 16, v18
 ; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off offset:23
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
 ; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[8:9], off offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -3100,79 +1182,13 @@ define void @memcpy_p1_p0_sz32_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p1_p0_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1f
-; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:29
-; CHECK-NEXT:    flat_load_ubyte v5, v[2:3] offset:30
-; CHECK-NEXT:    flat_load_ubyte v6, v[2:3] offset:31
-; CHECK-NEXT:    flat_load_ubyte v7, v[2:3] offset:24
-; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:25
-; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:26
-; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:28
-; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:27
-; CHECK-NEXT:    flat_load_ubyte v12, v[2:3] offset:15
-; CHECK-NEXT:    flat_load_ubyte v13, v[2:3] offset:14
-; CHECK-NEXT:    flat_load_ubyte v14, v[2:3] offset:13
-; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:12
-; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:23
-; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:22
-; CHECK-NEXT:    flat_load_ubyte v18, v[2:3] offset:21
-; CHECK-NEXT:    flat_load_ubyte v19, v[2:3] offset:20
-; CHECK-NEXT:    flat_load_ubyte v20, v[2:3] offset:19
-; CHECK-NEXT:    flat_load_ubyte v21, v[2:3] offset:18
-; CHECK-NEXT:    flat_load_ubyte v22, v[2:3] offset:17
-; CHECK-NEXT:    flat_load_ubyte v23, v[2:3] offset:16
-; CHECK-NEXT:    flat_load_ubyte v24, v[2:3] offset:11
-; CHECK-NEXT:    flat_load_ubyte v25, v[2:3] offset:10
-; CHECK-NEXT:    flat_load_ubyte v26, v[2:3] offset:9
-; CHECK-NEXT:    flat_load_ubyte v27, v[2:3] offset:8
-; CHECK-NEXT:    flat_load_ubyte v28, v[2:3] offset:7
-; CHECK-NEXT:    flat_load_ubyte v29, v[2:3] offset:6
-; CHECK-NEXT:    flat_load_ubyte v30, v[2:3] offset:5
-; CHECK-NEXT:    flat_load_ubyte v31, v[2:3] offset:4
-; CHECK-NEXT:    flat_load_ubyte v32, v[2:3] offset:1
-; CHECK-NEXT:    flat_load_ubyte v33, v[2:3]
-; CHECK-NEXT:    flat_load_ubyte v34, v[2:3] offset:3
-; CHECK-NEXT:    flat_load_ubyte v2, v[2:3] offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(25) lgkmcnt(25)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v4, 8, v10
-; CHECK-NEXT:    v_lshl_or_b32 v4, v6, 8, v5
-; CHECK-NEXT:    v_lshl_or_b32 v6, v8, 8, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(24) lgkmcnt(24)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v11, 8, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(22) lgkmcnt(22)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v12, 8, v13
-; CHECK-NEXT:    s_waitcnt vmcnt(20) lgkmcnt(20)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v14, 8, v15
-; CHECK-NEXT:    v_lshl_or_b32 v5, v4, 16, v3
-; CHECK-NEXT:    s_waitcnt vmcnt(18) lgkmcnt(18)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v16, 8, v17
-; CHECK-NEXT:    v_lshl_or_b32 v4, v7, 16, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(16) lgkmcnt(16)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v18, 8, v19
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v20, 8, v21
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v22, 8, v23
-; CHECK-NEXT:    v_lshl_or_b32 v3, v9, 16, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v15, v24, 8, v25
-; CHECK-NEXT:    v_lshl_or_b32 v9, v12, 16, v14
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v26, 8, v27
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v17, v28, 8, v29
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v16, v30, 8, v31
-; CHECK-NEXT:    v_lshl_or_b32 v8, v15, 16, v13
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v18, v32, 8, v33
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[2:3] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[8:11], v[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v19, v34, 8, v2
-; CHECK-NEXT:    v_lshl_or_b32 v2, v11, 16, v10
-; CHECK-NEXT:    v_lshl_or_b32 v7, v17, 16, v16
-; CHECK-NEXT:    v_lshl_or_b32 v6, v19, 16, v18
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:16
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[6:9], off
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[8:11], off
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 32, i1 false)
@@ -3183,23 +1199,8 @@ define void @memcpy_p1_p0_sz16_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p1_p0_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    flat_load_ushort v4, v[2:3] offset:6
-; CHECK-NEXT:    flat_load_ushort v5, v[2:3]
-; CHECK-NEXT:    flat_load_ushort v6, v[2:3] offset:12
-; CHECK-NEXT:    flat_load_ushort v7, v[2:3] offset:8
-; CHECK-NEXT:    flat_load_ushort v8, v[2:3] offset:4
-; CHECK-NEXT:    flat_load_ushort v9, v[2:3] offset:2
-; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:14
-; CHECK-NEXT:    flat_load_ushort v11, v[2:3] offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v4, 16, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v9, 16, v5
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v10, 16, v6
+; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v11, 16, v7
 ; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -3211,52 +1212,16 @@ define void @memcpy_p1_p0_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p1_p0_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x13
-; CHECK-NEXT:    flat_load_ushort v4, v[2:3] offset:22
-; CHECK-NEXT:    flat_load_ushort v5, v[2:3] offset:16
-; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:26
-; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:25
-; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:24
-; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:23
-; CHECK-NEXT:    flat_load_ushort v6, v[2:3] offset:20
-; CHECK-NEXT:    flat_load_ushort v12, v[2:3] offset:18
-; CHECK-NEXT:    flat_load_ushort v13, v[2:3] offset:14
-; CHECK-NEXT:    flat_load_ushort v14, v[2:3] offset:12
-; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:30
-; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:29
-; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:28
-; CHECK-NEXT:    flat_load_ubyte v18, v[2:3] offset:27
-; CHECK-NEXT:    flat_load_ushort v19, v[2:3] offset:10
-; CHECK-NEXT:    flat_load_ushort v20, v[2:3] offset:8
-; CHECK-NEXT:    flat_load_ushort v21, v[2:3] offset:6
-; CHECK-NEXT:    flat_load_ushort v22, v[2:3] offset:4
-; CHECK-NEXT:    flat_load_ushort v23, v[2:3] offset:2
-; CHECK-NEXT:    flat_load_ushort v2, v[2:3]
-; CHECK-NEXT:    s_waitcnt vmcnt(16) lgkmcnt(16)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v8, 8, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v10, 8, v11
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(13)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v4, 16, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v12, 16, v5
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v13, 16, v14
-; CHECK-NEXT:    v_lshl_or_b32 v8, v8, 16, v10
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v15, 8, v16
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v17, 8, v18
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v19, 16, v20
+; CHECK-NEXT:    s_clause 0x2
+; CHECK-NEXT:    flat_load_dwordx2 v[6:7], v[2:3] offset:23
+; CHECK-NEXT:    flat_load_dwordx2 v[8:9], v[2:3] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
 ; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v21, 16, v22
-; CHECK-NEXT:    v_lshl_or_b32 v9, v13, 16, v12
+; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off offset:23
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[8:9], off offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v23, 16, v2
-; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off offset:16
 ; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
-; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[8:9], off offset:23
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -3267,39 +1232,13 @@ define void @memcpy_p1_p0_sz32_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p1_p0_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ushort v4, v[2:3] offset:30
-; CHECK-NEXT:    flat_load_ushort v5, v[2:3] offset:28
-; CHECK-NEXT:    flat_load_ushort v6, v[2:3] offset:26
-; CHECK-NEXT:    flat_load_ushort v7, v[2:3] offset:14
-; CHECK-NEXT:    flat_load_ushort v8, v[2:3] offset:12
-; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:10
-; CHECK-NEXT:    flat_load_ushort v11, v[2:3] offset:8
-; CHECK-NEXT:    flat_load_ushort v9, v[2:3] offset:24
-; CHECK-NEXT:    flat_load_ushort v12, v[2:3] offset:22
-; CHECK-NEXT:    flat_load_ushort v13, v[2:3] offset:20
-; CHECK-NEXT:    flat_load_ushort v14, v[2:3] offset:18
-; CHECK-NEXT:    flat_load_ushort v15, v[2:3] offset:16
-; CHECK-NEXT:    flat_load_ushort v16, v[2:3] offset:6
-; CHECK-NEXT:    flat_load_ushort v17, v[2:3] offset:4
-; CHECK-NEXT:    flat_load_ushort v18, v[2:3] offset:2
-; CHECK-NEXT:    flat_load_ushort v19, v[2:3]
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v4, 16, v5
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v6, 16, v9
-; CHECK-NEXT:    v_lshl_or_b32 v9, v7, 16, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v12, 16, v13
-; CHECK-NEXT:    v_lshl_or_b32 v8, v10, 16, v11
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v14, 16, v15
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v16, 16, v17
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[2:3] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[8:11], v[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v18, 16, v19
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:16
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[6:9], off
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[8:11], off
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 32, i1 false)
@@ -3323,47 +1262,13 @@ define void @memcpy_p1_p0_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p1_p0_sz31_align_8_8:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x10
-; CHECK-NEXT:    flat_load_ubyte v6, v[2:3] offset:18
-; CHECK-NEXT:    flat_load_ubyte v7, v[2:3] offset:26
-; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:25
-; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:24
-; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:23
-; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:30
-; CHECK-NEXT:    flat_load_ubyte v12, v[2:3] offset:29
-; CHECK-NEXT:    flat_load_ubyte v13, v[2:3] offset:28
-; CHECK-NEXT:    flat_load_ubyte v14, v[2:3] offset:27
-; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:22
-; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:21
-; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:20
-; CHECK-NEXT:    flat_load_ubyte v18, v[2:3] offset:19
-; CHECK-NEXT:    flat_load_ubyte v19, v[2:3] offset:16
-; CHECK-NEXT:    flat_load_ubyte v20, v[2:3] offset:15
-; CHECK-NEXT:    flat_load_ubyte v21, v[2:3] offset:17
-; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v7, 8, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v9, 8, v10
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v11, 8, v12
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v13, 8, v14
-; CHECK-NEXT:    v_lshl_or_b32 v8, v7, 16, v10
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v15, 8, v16
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v17, 8, v18
-; CHECK-NEXT:    v_lshl_or_b32 v9, v11, 16, v13
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v15, v19, 8, v20
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[2:3] offset:15
+; CHECK-NEXT:    flat_load_dwordx4 v[8:11], v[2:3]
 ; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v6, 8, v21
-; CHECK-NEXT:    v_lshl_or_b32 v7, v14, 16, v12
-; CHECK-NEXT:    v_lshl_or_b32 v6, v6, 16, v15
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off offset:15
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[6:9], off offset:15
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[8:11], off
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 31, i1 false)
@@ -3404,47 +1309,13 @@ define void @memcpy_p1_p0_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr a
 ; CHECK-LABEL: memcpy_p1_p0_sz31_align_16_16:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x10
-; CHECK-NEXT:    flat_load_ubyte v6, v[2:3] offset:18
-; CHECK-NEXT:    flat_load_ubyte v7, v[2:3] offset:26
-; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:25
-; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:24
-; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:23
-; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:30
-; CHECK-NEXT:    flat_load_ubyte v12, v[2:3] offset:29
-; CHECK-NEXT:    flat_load_ubyte v13, v[2:3] offset:28
-; CHECK-NEXT:    flat_load_ubyte v14, v[2:3] offset:27
-; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:22
-; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:21
-; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:20
-; CHECK-NEXT:    flat_load_ubyte v18, v[2:3] offset:19
-; CHECK-NEXT:    flat_load_ubyte v19, v[2:3] offset:16
-; CHECK-NEXT:    flat_load_ubyte v20, v[2:3] offset:15
-; CHECK-NEXT:    flat_load_ubyte v21, v[2:3] offset:17
-; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v7, 8, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v9, 8, v10
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v11, 8, v12
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v13, 8, v14
-; CHECK-NEXT:    v_lshl_or_b32 v8, v7, 16, v10
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v15, 8, v16
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v17, 8, v18
-; CHECK-NEXT:    v_lshl_or_b32 v9, v11, 16, v13
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v15, v19, 8, v20
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[2:3] offset:15
+; CHECK-NEXT:    flat_load_dwordx4 v[8:11], v[2:3]
 ; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v6, 8, v21
-; CHECK-NEXT:    v_lshl_or_b32 v7, v14, 16, v12
-; CHECK-NEXT:    v_lshl_or_b32 v6, v6, 16, v15
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off offset:15
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[6:9], off offset:15
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[8:11], off
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 31, i1 false)
@@ -4042,44 +1913,13 @@ define void @memcpy_p1_p5_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p1_p5_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v4, 8, v3
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v6, 8, v5
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v8, 8, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v10, 8, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v12, 8, v11
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v14, 8, v13
-; CHECK-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v16, 8, v15
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v2, 8, v17
-; CHECK-NEXT:    v_lshl_or_b32 v2, v5, 16, v4
-; CHECK-NEXT:    v_lshl_or_b32 v5, v9, 16, v6
-; CHECK-NEXT:    v_lshl_or_b32 v4, v11, 16, v10
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 16, i1 false)
@@ -4090,81 +1930,21 @@ define void @memcpy_p1_p5_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p1_p5_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1e
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(29)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v4, 8, v3
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v6, 8, v5
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v8, 8, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v10, 8, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(21)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v12, 8, v11
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v14, 8, v13
-; CHECK-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v16, 8, v15
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v18, 8, v17
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v21, 8, v20
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v15, v23, 8, v22
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v19, 8, v25
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v26, 8, v24
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    v_lshl_or_b32 v16, v28, 8, v27
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    v_lshl_or_b32 v17, v30, 8, v29
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    s_waitcnt vmcnt(4)
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v18, v31, 8, v21
-; CHECK-NEXT:    v_lshl_or_b32 v7, v13, 16, v12
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v19, v2, 8, v32
-; CHECK-NEXT:    v_lshl_or_b32 v2, v5, 16, v4
-; CHECK-NEXT:    v_lshl_or_b32 v5, v9, 16, v6
-; CHECK-NEXT:    v_lshl_or_b32 v4, v11, 16, v10
-; CHECK-NEXT:    v_lshl_or_b32 v6, v14, 16, v15
-; CHECK-NEXT:    v_lshl_or_b32 v9, v17, 16, v16
-; CHECK-NEXT:    v_lshl_or_b32 v8, v19, 16, v18
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
-; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off offset:16
-; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[8:9], off offset:23
+; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[7:8], off offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[9:10], off offset:23
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -4175,81 +1955,19 @@ define void @memcpy_p1_p5_sz32_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p1_p5_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1f
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:31
-; CHECK-NEXT:    buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v4, 8, v3
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v6, 8, v5
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v8, 8, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v10, 8, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v12, 8, v11
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v14, 8, v13
-; CHECK-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v16, 8, v15
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v18, 8, v17
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v21, 8, v20
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    v_lshl_or_b32 v15, v23, 8, v22
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v19, 8, v25
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v26, 8, v24
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v16, v28, 8, v27
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v17, v30, 8, v29
-; CHECK-NEXT:    v_lshl_or_b32 v7, v13, 16, v12
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v18, v32, 8, v31
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v19, v2, 8, v33
-; CHECK-NEXT:    v_lshl_or_b32 v2, v5, 16, v4
-; CHECK-NEXT:    v_lshl_or_b32 v5, v9, 16, v6
-; CHECK-NEXT:    v_lshl_or_b32 v4, v11, 16, v10
-; CHECK-NEXT:    v_lshl_or_b32 v6, v14, 16, v15
-; CHECK-NEXT:    v_lshl_or_b32 v9, v17, 16, v16
-; CHECK-NEXT:    v_lshl_or_b32 v8, v19, 16, v18
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[6:9], off offset:16
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:16
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 32, i1 false)
@@ -4260,24 +1978,13 @@ define void @memcpy_p1_p5_sz16_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p1_p5_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    buffer_load_ushort v4, v2, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ushort v5, v2, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ushort v3, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ushort v6, v2, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ushort v7, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v8, v2, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ushort v9, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ushort v10, v2, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v5, 16, v4
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v6, 16, v3
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v8, 16, v7
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v10, 16, v9
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 16, i1 false)
@@ -4288,52 +1995,21 @@ define void @memcpy_p1_p5_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p1_p5_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x13
-; CHECK-NEXT:    buffer_load_ushort v4, v2, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ushort v5, v2, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ushort v3, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ushort v6, v2, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ushort v7, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ushort v8, v2, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ushort v9, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v10, v2, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ushort v19, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ushort v20, v2, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ushort v21, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ushort v22, v2, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v5, 16, v4
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v6, 16, v3
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v8, 16, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v10, 16, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v12, 8, v11
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v14, 8, v13
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v16, 8, v15
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:27
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v18, 8, v17
-; CHECK-NEXT:    v_lshl_or_b32 v9, v9, 16, v10
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v20, 16, v19
+; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[7:8], off offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v22, 16, v21
-; CHECK-NEXT:    v_lshl_or_b32 v8, v12, 16, v11
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
-; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off offset:16
-; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[8:9], off offset:23
+; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[9:10], off offset:23
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -4344,41 +2020,19 @@ define void @memcpy_p1_p5_sz32_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p1_p5_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    buffer_load_ushort v4, v2, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ushort v5, v2, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ushort v3, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ushort v6, v2, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ushort v7, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ushort v8, v2, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ushort v9, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v10, v2, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ushort v11, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ushort v12, v2, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ushort v13, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ushort v14, v2, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ushort v15, v2, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ushort v16, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ushort v17, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ushort v18, v2, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v5, 16, v4
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v6, 16, v3
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v8, 16, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v10, 16, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v12, 16, v11
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v14, 16, v13
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v16, 16, v15
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v18, 16, v17
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[6:9], off offset:16
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:16
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 32, i1 false)
@@ -4406,49 +2060,18 @@ define void @memcpy_p1_p5_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p1_p5_sz31_align_8_8:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x13
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:18
+; CHECK-NEXT:    s_clause 0x7
 ; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
 ; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v8, 8, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v10, 8, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v12, 8, v11
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v14, 8, v13
-; CHECK-NEXT:    v_lshl_or_b32 v10, v2, 16, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v16, 8, v15
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v18, 8, v17
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v20, 8, v19
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:15
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:27
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v22, 8, v21
-; CHECK-NEXT:    v_lshl_or_b32 v9, v9, 16, v8
-; CHECK-NEXT:    v_lshl_or_b32 v8, v12, 16, v11
-; CHECK-NEXT:    v_lshl_or_b32 v7, v14, 16, v13
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:15
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -4500,49 +2123,18 @@ define void @memcpy_p1_p5_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr a
 ; CHECK-LABEL: memcpy_p1_p5_sz31_align_16_16:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x13
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:18
+; CHECK-NEXT:    s_clause 0x7
 ; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
 ; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v8, 8, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v10, 8, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v12, 8, v11
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v14, 8, v13
-; CHECK-NEXT:    v_lshl_or_b32 v10, v2, 16, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v16, 8, v15
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v18, 8, v17
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v20, 8, v19
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:15
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:27
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v22, 8, v21
-; CHECK-NEXT:    v_lshl_or_b32 v9, v9, 16, v8
-; CHECK-NEXT:    v_lshl_or_b32 v8, v12, 16, v11
-; CHECK-NEXT:    v_lshl_or_b32 v7, v14, 16, v13
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:15
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -4577,41 +2169,8 @@ define void @memcpy_p3_p0_sz16_align_1_1(ptr addrspace(3) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p3_p0_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ubyte v3, v[1:2] offset:5
-; CHECK-NEXT:    flat_load_ubyte v4, v[1:2] offset:6
-; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:7
-; CHECK-NEXT:    flat_load_ubyte v6, v[1:2]
-; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:1
-; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:2
-; CHECK-NEXT:    flat_load_ubyte v9, v[1:2] offset:4
-; CHECK-NEXT:    flat_load_ubyte v10, v[1:2] offset:3
-; CHECK-NEXT:    flat_load_ubyte v11, v[1:2] offset:15
-; CHECK-NEXT:    flat_load_ubyte v12, v[1:2] offset:14
-; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:13
-; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:12
-; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:9
-; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:8
-; CHECK-NEXT:    flat_load_ubyte v17, v[1:2] offset:11
-; CHECK-NEXT:    flat_load_ubyte v1, v[1:2] offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v3, 8, v9
-; CHECK-NEXT:    v_lshl_or_b32 v3, v5, 8, v4
-; CHECK-NEXT:    v_lshl_or_b32 v4, v7, 8, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v10, 8, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v11, 8, v12
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v13, 8, v14
-; CHECK-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v15, 8, v16
+; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v17, 8, v1
-; CHECK-NEXT:    v_lshl_or_b32 v1, v5, 16, v4
-; CHECK-NEXT:    v_lshl_or_b32 v4, v7, 16, v6
-; CHECK-NEXT:    v_lshl_or_b32 v3, v9, 16, v8
 ; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -4624,80 +2183,16 @@ define void @memcpy_p3_p0_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p3_p0_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1e
-; CHECK-NEXT:    flat_load_ubyte v3, v[1:2] offset:28
-; CHECK-NEXT:    flat_load_ubyte v4, v[1:2] offset:29
-; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:30
-; CHECK-NEXT:    flat_load_ubyte v6, v[1:2] offset:24
-; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:25
-; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:27
-; CHECK-NEXT:    flat_load_ubyte v9, v[1:2] offset:26
-; CHECK-NEXT:    flat_load_ubyte v10, v[1:2] offset:15
-; CHECK-NEXT:    flat_load_ubyte v11, v[1:2] offset:14
-; CHECK-NEXT:    flat_load_ubyte v12, v[1:2] offset:13
-; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:12
-; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:23
-; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:22
-; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:21
-; CHECK-NEXT:    flat_load_ubyte v17, v[1:2] offset:20
-; CHECK-NEXT:    flat_load_ubyte v18, v[1:2] offset:19
-; CHECK-NEXT:    flat_load_ubyte v19, v[1:2] offset:18
-; CHECK-NEXT:    flat_load_ubyte v20, v[1:2] offset:17
-; CHECK-NEXT:    flat_load_ubyte v21, v[1:2] offset:16
-; CHECK-NEXT:    flat_load_ubyte v22, v[1:2] offset:11
-; CHECK-NEXT:    flat_load_ubyte v23, v[1:2] offset:10
-; CHECK-NEXT:    flat_load_ubyte v24, v[1:2] offset:9
-; CHECK-NEXT:    flat_load_ubyte v25, v[1:2] offset:8
-; CHECK-NEXT:    flat_load_ubyte v26, v[1:2] offset:7
-; CHECK-NEXT:    flat_load_ubyte v27, v[1:2] offset:6
-; CHECK-NEXT:    flat_load_ubyte v28, v[1:2] offset:5
-; CHECK-NEXT:    flat_load_ubyte v29, v[1:2] offset:4
-; CHECK-NEXT:    flat_load_ubyte v30, v[1:2] offset:1
-; CHECK-NEXT:    flat_load_ubyte v31, v[1:2]
-; CHECK-NEXT:    flat_load_ubyte v32, v[1:2] offset:3
-; CHECK-NEXT:    flat_load_ubyte v1, v[1:2] offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(25) lgkmcnt(25)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v3, 8, v8
-; CHECK-NEXT:    v_lshl_or_b32 v3, v5, 8, v4
-; CHECK-NEXT:    s_waitcnt vmcnt(24) lgkmcnt(24)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v9, 8, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(22) lgkmcnt(22)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v10, 8, v11
-; CHECK-NEXT:    s_waitcnt vmcnt(20) lgkmcnt(20)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v12, 8, v13
-; CHECK-NEXT:    s_waitcnt vmcnt(19) lgkmcnt(19)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v6, 8, v14
-; CHECK-NEXT:    s_waitcnt vmcnt(18) lgkmcnt(18)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v14, 8, v15
-; CHECK-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
-; CHECK-NEXT:    s_waitcnt vmcnt(16) lgkmcnt(16)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v16, 8, v17
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v18, 8, v19
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v20, 8, v21
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v22, 8, v23
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v24, 8, v25
-; CHECK-NEXT:    v_lshl_or_b32 v3, v9, 16, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v15, v26, 8, v27
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v28, 8, v29
+; CHECK-NEXT:    s_clause 0x2
+; CHECK-NEXT:    flat_load_dwordx2 v[5:6], v[1:2] offset:23
+; CHECK-NEXT:    flat_load_dwordx2 v[7:8], v[1:2] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
 ; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v16, v30, 8, v31
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v17, v32, 8, v1
-; CHECK-NEXT:    v_lshl_or_b32 v1, v5, 16, v4
-; CHECK-NEXT:    v_lshl_or_b32 v4, v7, 16, v6
-; CHECK-NEXT:    v_lshl_or_b32 v6, v10, 16, v12
-; CHECK-NEXT:    v_lshl_or_b32 v5, v13, 16, v11
-; CHECK-NEXT:    v_lshl_or_b32 v8, v15, 16, v14
-; CHECK-NEXT:    v_lshl_or_b32 v7, v17, 16, v16
-; CHECK-NEXT:    ds_write_b64 v0, v[1:2] offset:23
-; CHECK-NEXT:    ds_write_b64 v0, v[3:4] offset:16
-; CHECK-NEXT:    ds_write2_b64 v0, v[7:8], v[5:6] offset1:1
+; CHECK-NEXT:    ds_write_b64 v0, v[5:6] offset:23
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(2)
+; CHECK-NEXT:    ds_write_b64 v0, v[7:8] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(2)
+; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -4709,79 +2204,13 @@ define void @memcpy_p3_p0_sz32_align_1_1(ptr addrspace(3) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p3_p0_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1f
-; CHECK-NEXT:    flat_load_ubyte v3, v[1:2] offset:29
-; CHECK-NEXT:    flat_load_ubyte v4, v[1:2] offset:30
-; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:31
-; CHECK-NEXT:    flat_load_ubyte v6, v[1:2] offset:24
-; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:25
-; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:26
-; CHECK-NEXT:    flat_load_ubyte v9, v[1:2] offset:28
-; CHECK-NEXT:    flat_load_ubyte v10, v[1:2] offset:27
-; CHECK-NEXT:    flat_load_ubyte v11, v[1:2] offset:15
-; CHECK-NEXT:    flat_load_ubyte v12, v[1:2] offset:14
-; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:13
-; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:12
-; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:23
-; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:22
-; CHECK-NEXT:    flat_load_ubyte v17, v[1:2] offset:21
-; CHECK-NEXT:    flat_load_ubyte v18, v[1:2] offset:20
-; CHECK-NEXT:    flat_load_ubyte v19, v[1:2] offset:19
-; CHECK-NEXT:    flat_load_ubyte v20, v[1:2] offset:18
-; CHECK-NEXT:    flat_load_ubyte v21, v[1:2] offset:17
-; CHECK-NEXT:    flat_load_ubyte v22, v[1:2] offset:16
-; CHECK-NEXT:    flat_load_ubyte v23, v[1:2] offset:11
-; CHECK-NEXT:    flat_load_ubyte v24, v[1:2] offset:10
-; CHECK-NEXT:    flat_load_ubyte v25, v[1:2] offset:9
-; CHECK-NEXT:    flat_load_ubyte v26, v[1:2] offset:8
-; CHECK-NEXT:    flat_load_ubyte v27, v[1:2] offset:7
-; CHECK-NEXT:    flat_load_ubyte v28, v[1:2] offset:6
-; CHECK-NEXT:    flat_load_ubyte v29, v[1:2] offset:5
-; CHECK-NEXT:    flat_load_ubyte v30, v[1:2] offset:4
-; CHECK-NEXT:    flat_load_ubyte v31, v[1:2] offset:1
-; CHECK-NEXT:    flat_load_ubyte v32, v[1:2]
-; CHECK-NEXT:    flat_load_ubyte v33, v[1:2] offset:3
-; CHECK-NEXT:    flat_load_ubyte v1, v[1:2] offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(25) lgkmcnt(25)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v3, 8, v9
-; CHECK-NEXT:    v_lshl_or_b32 v3, v5, 8, v4
-; CHECK-NEXT:    v_lshl_or_b32 v4, v7, 8, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(24) lgkmcnt(24)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v10, 8, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(22) lgkmcnt(22)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v11, 8, v12
-; CHECK-NEXT:    s_waitcnt vmcnt(20) lgkmcnt(20)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v13, 8, v14
-; CHECK-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
-; CHECK-NEXT:    s_waitcnt vmcnt(18) lgkmcnt(18)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v15, 8, v16
-; CHECK-NEXT:    s_waitcnt vmcnt(16) lgkmcnt(16)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v17, 8, v18
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v19, 8, v20
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v21, 8, v22
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v23, 8, v24
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v25, 8, v26
-; CHECK-NEXT:    v_lshl_or_b32 v3, v9, 16, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v15, v27, 8, v28
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v29, 8, v30
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v16, v31, 8, v32
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v17, v33, 8, v1
-; CHECK-NEXT:    v_lshl_or_b32 v1, v5, 16, v4
-; CHECK-NEXT:    v_lshl_or_b32 v4, v7, 16, v6
-; CHECK-NEXT:    v_lshl_or_b32 v6, v11, 16, v10
-; CHECK-NEXT:    v_lshl_or_b32 v5, v13, 16, v12
-; CHECK-NEXT:    v_lshl_or_b32 v8, v15, 16, v14
-; CHECK-NEXT:    v_lshl_or_b32 v7, v17, 16, v16
-; CHECK-NEXT:    ds_write2_b64 v0, v[3:4], v[1:2] offset0:2 offset1:3
-; CHECK-NEXT:    ds_write2_b64 v0, v[7:8], v[5:6] offset1:1
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[3:6], v[1:2] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[7:10], v[1:2]
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT:    ds_write2_b64 v0, v[3:4], v[5:6] offset0:2 offset1:3
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(1)
+; CHECK-NEXT:    ds_write2_b64 v0, v[7:8], v[9:10] offset1:1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -4793,23 +2222,8 @@ define void @memcpy_p3_p0_sz16_align_2_2(ptr addrspace(3) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p3_p0_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    flat_load_ushort v3, v[1:2] offset:6
-; CHECK-NEXT:    flat_load_ushort v4, v[1:2]
-; CHECK-NEXT:    flat_load_ushort v5, v[1:2] offset:12
-; CHECK-NEXT:    flat_load_ushort v6, v[1:2] offset:8
-; CHECK-NEXT:    flat_load_ushort v7, v[1:2] offset:4
-; CHECK-NEXT:    flat_load_ushort v8, v[1:2] offset:2
-; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:14
-; CHECK-NEXT:    flat_load_ushort v10, v[1:2] offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v3, 16, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v1, v8, 16, v4
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v9, 16, v5
+; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v10, 16, v6
 ; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -4822,51 +2236,16 @@ define void @memcpy_p3_p0_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p3_p0_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x13
-; CHECK-NEXT:    flat_load_ushort v3, v[1:2] offset:22
-; CHECK-NEXT:    flat_load_ushort v4, v[1:2] offset:16
-; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:26
-; CHECK-NEXT:    flat_load_ubyte v6, v[1:2] offset:25
-; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:24
-; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:23
-; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:20
-; CHECK-NEXT:    flat_load_ushort v10, v[1:2] offset:18
-; CHECK-NEXT:    flat_load_ushort v11, v[1:2] offset:14
-; CHECK-NEXT:    flat_load_ushort v12, v[1:2] offset:12
-; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:30
-; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:29
-; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:28
-; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:27
-; CHECK-NEXT:    flat_load_ushort v17, v[1:2] offset:10
-; CHECK-NEXT:    flat_load_ushort v18, v[1:2] offset:8
-; CHECK-NEXT:    flat_load_ushort v19, v[1:2] offset:6
-; CHECK-NEXT:    flat_load_ushort v20, v[1:2] offset:4
-; CHECK-NEXT:    flat_load_ushort v21, v[1:2] offset:2
-; CHECK-NEXT:    flat_load_ushort v22, v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v7, 8, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(13)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v3, 16, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v1, v10, 16, v4
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v11, 16, v12
-; CHECK-NEXT:    v_lshl_or_b32 v11, v5, 8, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v13, 8, v14
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v15, 8, v16
-; CHECK-NEXT:    v_lshl_or_b32 v7, v11, 16, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v17, 16, v18
+; CHECK-NEXT:    s_clause 0x2
+; CHECK-NEXT:    flat_load_dwordx2 v[5:6], v[1:2] offset:23
+; CHECK-NEXT:    flat_load_dwordx2 v[7:8], v[1:2] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
 ; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v19, 16, v20
-; CHECK-NEXT:    v_lshl_or_b32 v8, v10, 16, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v21, 16, v22
-; CHECK-NEXT:    ds_write_b64 v0, v[1:2] offset:16
-; CHECK-NEXT:    ds_write2_b64 v0, v[5:6], v[3:4] offset1:1
-; CHECK-NEXT:    ds_write_b64 v0, v[7:8] offset:23
+; CHECK-NEXT:    ds_write_b64 v0, v[5:6] offset:23
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(2)
+; CHECK-NEXT:    ds_write_b64 v0, v[7:8] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(2)
+; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -4878,40 +2257,13 @@ define void @memcpy_p3_p0_sz32_align_2_2(ptr addrspace(3) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p3_p0_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ushort v3, v[1:2] offset:30
-; CHECK-NEXT:    flat_load_ushort v4, v[1:2] offset:28
-; CHECK-NEXT:    flat_load_ushort v5, v[1:2] offset:26
-; CHECK-NEXT:    flat_load_ushort v6, v[1:2] offset:14
-; CHECK-NEXT:    flat_load_ushort v7, v[1:2] offset:12
-; CHECK-NEXT:    flat_load_ushort v8, v[1:2] offset:10
-; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:8
-; CHECK-NEXT:    flat_load_ushort v10, v[1:2] offset:24
-; CHECK-NEXT:    flat_load_ushort v11, v[1:2] offset:22
-; CHECK-NEXT:    flat_load_ushort v12, v[1:2] offset:20
-; CHECK-NEXT:    flat_load_ushort v13, v[1:2] offset:18
-; CHECK-NEXT:    flat_load_ushort v14, v[1:2] offset:16
-; CHECK-NEXT:    flat_load_ushort v15, v[1:2] offset:6
-; CHECK-NEXT:    flat_load_ushort v16, v[1:2] offset:4
-; CHECK-NEXT:    flat_load_ushort v17, v[1:2] offset:2
-; CHECK-NEXT:    flat_load_ushort v18, v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v3, 16, v4
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(11)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v6, 16, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v1, v5, 16, v10
-; CHECK-NEXT:    v_lshl_or_b32 v5, v8, 16, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v11, 16, v12
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v13, 16, v14
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v15, 16, v16
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v17, 16, v18
-; CHECK-NEXT:    ds_write2_b64 v0, v[3:4], v[1:2] offset0:2 offset1:3
-; CHECK-NEXT:    ds_write2_b64 v0, v[7:8], v[5:6] offset1:1
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[3:6], v[1:2] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[7:10], v[1:2]
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT:    ds_write2_b64 v0, v[3:4], v[5:6] offset0:2 offset1:3
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(1)
+; CHECK-NEXT:    ds_write2_b64 v0, v[7:8], v[9:10] offset1:1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -4937,47 +2289,13 @@ define void @memcpy_p3_p0_sz31_align_8_8(ptr addrspace(3) align 8 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p3_p0_sz31_align_8_8:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x10
-; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:18
-; CHECK-NEXT:    flat_load_ubyte v6, v[1:2] offset:26
-; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:25
-; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:24
-; CHECK-NEXT:    flat_load_ubyte v9, v[1:2] offset:23
-; CHECK-NEXT:    flat_load_ubyte v10, v[1:2] offset:30
-; CHECK-NEXT:    flat_load_ubyte v11, v[1:2] offset:29
-; CHECK-NEXT:    flat_load_ubyte v12, v[1:2] offset:28
-; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:27
-; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:22
-; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:21
-; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:20
-; CHECK-NEXT:    flat_load_ubyte v17, v[1:2] offset:19
-; CHECK-NEXT:    flat_load_ubyte v18, v[1:2] offset:16
-; CHECK-NEXT:    flat_load_ubyte v19, v[1:2] offset:15
-; CHECK-NEXT:    flat_load_ubyte v20, v[1:2] offset:17
-; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v6, 8, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v8, 8, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v10, 8, v11
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v12, 8, v13
-; CHECK-NEXT:    v_lshl_or_b32 v7, v6, 16, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v14, 8, v15
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v16, 8, v17
-; CHECK-NEXT:    v_lshl_or_b32 v8, v10, 16, v12
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v18, 8, v19
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[3:6], v[1:2]
+; CHECK-NEXT:    flat_load_dwordx4 v[7:10], v[1:2] offset:15
 ; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v5, 8, v20
-; CHECK-NEXT:    v_lshl_or_b32 v6, v13, 16, v11
-; CHECK-NEXT:    v_lshl_or_b32 v5, v5, 16, v14
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
-; CHECK-NEXT:    ds_write_b128 v0, v[5:8] offset:15
+; CHECK-NEXT:    ds_write2_b64 v0, v[3:4], v[5:6] offset1:1
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(1)
+; CHECK-NEXT:    ds_write_b128 v0, v[7:10] offset:15
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -5021,47 +2339,13 @@ define void @memcpy_p3_p0_sz31_align_16_16(ptr addrspace(3) align 16 %dst, ptr a
 ; CHECK-LABEL: memcpy_p3_p0_sz31_align_16_16:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x10
-; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:18
-; CHECK-NEXT:    flat_load_ubyte v6, v[1:2] offset:26
-; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:25
-; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:24
-; CHECK-NEXT:    flat_load_ubyte v9, v[1:2] offset:23
-; CHECK-NEXT:    flat_load_ubyte v10, v[1:2] offset:30
-; CHECK-NEXT:    flat_load_ubyte v11, v[1:2] offset:29
-; CHECK-NEXT:    flat_load_ubyte v12, v[1:2] offset:28
-; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:27
-; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:22
-; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:21
-; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:20
-; CHECK-NEXT:    flat_load_ubyte v17, v[1:2] offset:19
-; CHECK-NEXT:    flat_load_ubyte v18, v[1:2] offset:16
-; CHECK-NEXT:    flat_load_ubyte v19, v[1:2] offset:15
-; CHECK-NEXT:    flat_load_ubyte v20, v[1:2] offset:17
-; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v6, 8, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v8, 8, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v10, 8, v11
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v12, 8, v13
-; CHECK-NEXT:    v_lshl_or_b32 v7, v6, 16, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v14, 8, v15
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v16, 8, v17
-; CHECK-NEXT:    v_lshl_or_b32 v8, v10, 16, v12
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v18, 8, v19
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[3:6], v[1:2] offset:15
+; CHECK-NEXT:    flat_load_dwordx4 v[7:10], v[1:2]
 ; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v5, 8, v20
-; CHECK-NEXT:    v_lshl_or_b32 v6, v13, 16, v11
-; CHECK-NEXT:    v_lshl_or_b32 v5, v5, 16, v14
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_write_b128 v0, v[1:4]
-; CHECK-NEXT:    ds_write_b128 v0, v[5:8] offset:15
+; CHECK-NEXT:    ds_write_b128 v0, v[3:6] offset:15
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(1)
+; CHECK-NEXT:    ds_write_b128 v0, v[7:10]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -5701,44 +2985,13 @@ define void @memcpy_p3_p5_sz16_align_1_1(ptr addrspace(3) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p3_p5_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v3, 8, v2
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v5, 8, v4
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v7, 8, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v9, 8, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v11, 8, v10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v13, 8, v12
-; CHECK-NEXT:    v_lshl_or_b32 v2, v7, 16, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v15, 8, v14
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v1, 8, v16
-; CHECK-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
-; CHECK-NEXT:    v_lshl_or_b32 v4, v8, 16, v5
-; CHECK-NEXT:    v_lshl_or_b32 v3, v10, 16, v9
-; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; CHECK-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -5750,81 +3003,21 @@ define void @memcpy_p3_p5_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p3_p5_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1e
-; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(29)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v3, 8, v2
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v5, 8, v4
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v7, 8, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v9, 8, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(21)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v11, 8, v10
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v13, 8, v12
-; CHECK-NEXT:    v_lshl_or_b32 v2, v7, 16, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v15, 8, v14
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v17, 8, v16
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v20, 8, v19
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v22, 8, v21
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v18, 8, v24
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v25, 8, v23
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    v_lshl_or_b32 v15, v27, 8, v26
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    v_lshl_or_b32 v16, v29, 8, v28
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_load_dword v9, v1, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    s_waitcnt vmcnt(4)
+; CHECK-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v17, v30, 8, v20
-; CHECK-NEXT:    v_lshl_or_b32 v6, v12, 16, v11
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v18, v1, 8, v31
-; CHECK-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
-; CHECK-NEXT:    v_lshl_or_b32 v4, v8, 16, v5
-; CHECK-NEXT:    v_lshl_or_b32 v3, v10, 16, v9
-; CHECK-NEXT:    v_lshl_or_b32 v5, v13, 16, v14
-; CHECK-NEXT:    v_lshl_or_b32 v8, v16, 16, v15
-; CHECK-NEXT:    v_lshl_or_b32 v7, v18, 16, v17
-; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
-; CHECK-NEXT:    ds_write_b64 v0, v[5:6] offset:16
-; CHECK-NEXT:    ds_write_b64 v0, v[7:8] offset:23
+; CHECK-NEXT:    ds_write_b64 v0, v[6:7] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    ds_write_b64 v0, v[8:9] offset:23
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -5836,81 +3029,19 @@ define void @memcpy_p3_p5_sz32_align_1_1(ptr addrspace(3) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p3_p5_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1f
-; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:31
-; CHECK-NEXT:    buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v3, 8, v2
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v5, 8, v4
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v7, 8, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v9, 8, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v11, 8, v10
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v13, 8, v12
-; CHECK-NEXT:    v_lshl_or_b32 v2, v7, 16, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v15, 8, v14
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v17, 8, v16
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v20, 8, v19
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v22, 8, v21
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v18, 8, v24
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v25, 8, v23
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v15, v27, 8, v26
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v9, v1, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v16, v29, 8, v28
-; CHECK-NEXT:    v_lshl_or_b32 v6, v12, 16, v11
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v17, v31, 8, v30
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v18, v1, 8, v32
-; CHECK-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
-; CHECK-NEXT:    v_lshl_or_b32 v4, v8, 16, v5
-; CHECK-NEXT:    v_lshl_or_b32 v3, v10, 16, v9
-; CHECK-NEXT:    v_lshl_or_b32 v5, v13, 16, v14
-; CHECK-NEXT:    v_lshl_or_b32 v8, v16, 16, v15
-; CHECK-NEXT:    v_lshl_or_b32 v7, v18, 16, v17
-; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
-; CHECK-NEXT:    ds_write2_b64 v0, v[5:6], v[7:8] offset0:2 offset1:3
+; CHECK-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    ds_write2_b64 v0, v[6:7], v[8:9] offset0:2 offset1:3
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -5922,24 +3053,13 @@ define void @memcpy_p3_p5_sz16_align_2_2(ptr addrspace(3) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p3_p5_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    buffer_load_ushort v3, v1, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ushort v4, v1, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ushort v2, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ushort v5, v1, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ushort v6, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v7, v1, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ushort v8, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ushort v9, v1, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v5, 16, v2
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v7, 16, v6
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v9, 16, v8
-; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; CHECK-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -5951,52 +3071,21 @@ define void @memcpy_p3_p5_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p3_p5_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x13
-; CHECK-NEXT:    buffer_load_ushort v3, v1, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ushort v4, v1, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ushort v2, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ushort v5, v1, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ushort v6, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ushort v7, v1, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ushort v8, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v9, v1, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ushort v18, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ushort v19, v1, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ushort v20, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ushort v21, v1, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v5, 16, v2
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v7, 16, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v9, 16, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v11, 8, v10
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v13, 8, v12
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v15, 8, v14
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_load_dword v9, v1, s[0:3], 0 offen offset:27
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v17, 8, v16
-; CHECK-NEXT:    v_lshl_or_b32 v8, v8, 16, v9
+; CHECK-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v19, 16, v18
+; CHECK-NEXT:    ds_write_b64 v0, v[6:7] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v21, 16, v20
-; CHECK-NEXT:    v_lshl_or_b32 v7, v11, 16, v10
-; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
-; CHECK-NEXT:    ds_write_b64 v0, v[5:6] offset:16
-; CHECK-NEXT:    ds_write_b64 v0, v[7:8] offset:23
+; CHECK-NEXT:    ds_write_b64 v0, v[8:9] offset:23
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -6008,41 +3097,19 @@ define void @memcpy_p3_p5_sz32_align_2_2(ptr addrspace(3) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p3_p5_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    buffer_load_ushort v3, v1, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ushort v4, v1, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ushort v2, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ushort v5, v1, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ushort v6, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ushort v7, v1, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ushort v8, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v9, v1, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ushort v10, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ushort v11, v1, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ushort v12, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ushort v13, v1, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ushort v14, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ushort v15, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ushort v16, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ushort v17, v1, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v5, 16, v2
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v7, 16, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v9, 16, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v11, 16, v10
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v9, v1, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v13, 16, v12
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v15, 16, v14
+; CHECK-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v17, 16, v16
-; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
-; CHECK-NEXT:    ds_write2_b64 v0, v[5:6], v[7:8] offset0:2 offset1:3
+; CHECK-NEXT:    ds_write2_b64 v0, v[6:7], v[8:9] offset0:2 offset1:3
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -6072,50 +3139,19 @@ define void @memcpy_p3_p5_sz31_align_8_8(ptr addrspace(3) align 8 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p3_p5_sz31_align_8_8:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x13
-; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v3, 8, v2
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    v_lshl_or_b32 v1, v9, 8, v4
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v11, 8, v10
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v13, 8, v12
-; CHECK-NEXT:    v_lshl_or_b32 v4, v1, 16, v2
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v15, 8, v14
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v17, 8, v16
-; CHECK-NEXT:    v_lshl_or_b32 v2, v11, 16, v10
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v19, 8, v18
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v9, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:15
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:27
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v21, 8, v20
-; CHECK-NEXT:    v_lshl_or_b32 v3, v9, 16, v3
-; CHECK-NEXT:    v_lshl_or_b32 v1, v13, 16, v12
+; CHECK-NEXT:    ds_write2_b64 v0, v[6:7], v[8:9] offset1:1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    ds_write2_b64 v0, v[5:6], v[7:8] offset1:1
-; CHECK-NEXT:    ds_write_b128 v0, v[1:4] offset:15
+; CHECK-NEXT:    ds_write_b128 v0, v[2:5] offset:15
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -6169,49 +3205,18 @@ define void @memcpy_p3_p5_sz31_align_16_16(ptr addrspace(3) align 16 %dst, ptr a
 ; CHECK-LABEL: memcpy_p3_p5_sz31_align_16_16:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x13
-; CHECK-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:18
+; CHECK-NEXT:    s_clause 0x7
 ; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
 ; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v7, 8, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    v_lshl_or_b32 v1, v9, 8, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v11, 8, v10
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v13, 8, v12
-; CHECK-NEXT:    v_lshl_or_b32 v9, v1, 16, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v15, 8, v14
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v17, 8, v16
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v19, 8, v18
+; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:15
+; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_load_dword v9, v1, s[0:3], 0 offen offset:27
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v21, 8, v20
-; CHECK-NEXT:    v_lshl_or_b32 v8, v8, 16, v7
-; CHECK-NEXT:    v_lshl_or_b32 v7, v11, 16, v10
-; CHECK-NEXT:    v_lshl_or_b32 v6, v13, 16, v12
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    ds_write_b128 v0, v[2:5]
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    ds_write_b128 v0, v[6:9] offset:15
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -6248,55 +3253,12 @@ define void @memcpy_p5_p0_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p0_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ubyte v3, v[1:2] offset:15
-; CHECK-NEXT:    flat_load_ubyte v4, v[1:2] offset:14
-; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:13
-; CHECK-NEXT:    flat_load_ubyte v6, v[1:2] offset:12
-; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:11
-; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:10
-; CHECK-NEXT:    flat_load_ubyte v9, v[1:2] offset:9
-; CHECK-NEXT:    flat_load_ubyte v10, v[1:2] offset:8
-; CHECK-NEXT:    flat_load_ubyte v11, v[1:2] offset:7
-; CHECK-NEXT:    flat_load_ubyte v12, v[1:2] offset:6
-; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:5
-; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:4
-; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:3
-; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:2
-; CHECK-NEXT:    flat_load_ubyte v17, v[1:2] offset:1
-; CHECK-NEXT:    flat_load_ubyte v1, v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 16, i1 false)
@@ -6307,101 +3269,24 @@ define void @memcpy_p5_p0_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p0_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xe
-; CHECK-NEXT:    flat_load_ubyte v3, v[1:2] offset:14
-; CHECK-NEXT:    flat_load_ubyte v4, v[1:2] offset:13
-; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:12
-; CHECK-NEXT:    flat_load_ubyte v6, v[1:2] offset:11
-; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:10
-; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:9
-; CHECK-NEXT:    flat_load_ubyte v9, v[1:2] offset:8
-; CHECK-NEXT:    flat_load_ubyte v10, v[1:2] offset:7
-; CHECK-NEXT:    flat_load_ubyte v11, v[1:2] offset:6
-; CHECK-NEXT:    flat_load_ubyte v12, v[1:2] offset:5
-; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:4
-; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:3
-; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:2
-; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:1
-; CHECK-NEXT:    flat_load_ubyte v17, v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ubyte v3, v[1:2] offset:30
-; CHECK-NEXT:    flat_load_ubyte v4, v[1:2] offset:29
-; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:28
-; CHECK-NEXT:    flat_load_ubyte v6, v[1:2] offset:27
-; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:26
-; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:25
-; CHECK-NEXT:    flat_load_ubyte v9, v[1:2] offset:24
-; CHECK-NEXT:    flat_load_ubyte v10, v[1:2] offset:23
-; CHECK-NEXT:    flat_load_ubyte v11, v[1:2] offset:22
-; CHECK-NEXT:    flat_load_ubyte v12, v[1:2] offset:21
-; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:20
-; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:19
-; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:18
-; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:17
-; CHECK-NEXT:    flat_load_ubyte v17, v[1:2] offset:16
-; CHECK-NEXT:    flat_load_ubyte v1, v[1:2] offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:30
+; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:28
+; CHECK-NEXT:    flat_load_dwordx3 v[5:7], v[1:2] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
 ; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:17
+; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:15
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -6412,104 +3297,19 @@ define void @memcpy_p5_p0_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p0_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ubyte v3, v[1:2] offset:15
-; CHECK-NEXT:    flat_load_ubyte v4, v[1:2] offset:14
-; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:13
-; CHECK-NEXT:    flat_load_ubyte v6, v[1:2] offset:12
-; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:11
-; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:10
-; CHECK-NEXT:    flat_load_ubyte v9, v[1:2] offset:9
-; CHECK-NEXT:    flat_load_ubyte v10, v[1:2] offset:8
-; CHECK-NEXT:    flat_load_ubyte v11, v[1:2] offset:7
-; CHECK-NEXT:    flat_load_ubyte v12, v[1:2] offset:6
-; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:5
-; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:4
-; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:3
-; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:2
-; CHECK-NEXT:    flat_load_ubyte v17, v[1:2] offset:1
-; CHECK-NEXT:    flat_load_ubyte v18, v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ubyte v3, v[1:2] offset:31
-; CHECK-NEXT:    flat_load_ubyte v4, v[1:2] offset:30
-; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:29
-; CHECK-NEXT:    flat_load_ubyte v6, v[1:2] offset:28
-; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:27
-; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:26
-; CHECK-NEXT:    flat_load_ubyte v9, v[1:2] offset:25
-; CHECK-NEXT:    flat_load_ubyte v10, v[1:2] offset:24
-; CHECK-NEXT:    flat_load_ubyte v11, v[1:2] offset:23
-; CHECK-NEXT:    flat_load_ubyte v12, v[1:2] offset:22
-; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:21
-; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:20
-; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:19
-; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:18
-; CHECK-NEXT:    flat_load_ubyte v17, v[1:2] offset:17
-; CHECK-NEXT:    flat_load_ubyte v1, v[1:2] offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:31
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[3:6], v[1:2] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[7:10], v[1:2]
 ; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:17
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 32, i1 false)
@@ -6520,31 +3320,12 @@ define void @memcpy_p5_p0_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p0_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    flat_load_ushort v3, v[1:2] offset:14
-; CHECK-NEXT:    flat_load_ushort v4, v[1:2] offset:12
-; CHECK-NEXT:    flat_load_ushort v5, v[1:2] offset:10
-; CHECK-NEXT:    flat_load_ushort v6, v[1:2] offset:8
-; CHECK-NEXT:    flat_load_ushort v7, v[1:2] offset:6
-; CHECK-NEXT:    flat_load_ushort v8, v[1:2] offset:4
-; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:2
-; CHECK-NEXT:    flat_load_ushort v1, v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
-; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 16, i1 false)
@@ -6555,55 +3336,24 @@ define void @memcpy_p5_p0_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p0_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ubyte v3, v[1:2] offset:30
-; CHECK-NEXT:    flat_load_ushort v4, v[1:2] offset:28
-; CHECK-NEXT:    flat_load_ushort v5, v[1:2] offset:26
-; CHECK-NEXT:    flat_load_ushort v6, v[1:2] offset:24
-; CHECK-NEXT:    flat_load_ushort v7, v[1:2] offset:22
-; CHECK-NEXT:    flat_load_ushort v8, v[1:2] offset:20
-; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:18
-; CHECK-NEXT:    flat_load_ushort v10, v[1:2] offset:16
-; CHECK-NEXT:    flat_load_ushort v11, v[1:2] offset:14
-; CHECK-NEXT:    flat_load_ushort v12, v[1:2] offset:12
-; CHECK-NEXT:    flat_load_ushort v13, v[1:2] offset:10
-; CHECK-NEXT:    flat_load_ushort v14, v[1:2] offset:8
-; CHECK-NEXT:    flat_load_ushort v15, v[1:2] offset:6
-; CHECK-NEXT:    flat_load_ushort v16, v[1:2] offset:4
-; CHECK-NEXT:    flat_load_ushort v17, v[1:2] offset:2
-; CHECK-NEXT:    flat_load_ushort v1, v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(13)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(11)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
-; CHECK-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
-; CHECK-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:30
+; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:28
+; CHECK-NEXT:    flat_load_dwordx3 v[5:7], v[1:2] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
 ; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_short v17, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -6614,55 +3364,19 @@ define void @memcpy_p5_p0_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p0_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ushort v3, v[1:2] offset:30
-; CHECK-NEXT:    flat_load_ushort v4, v[1:2] offset:28
-; CHECK-NEXT:    flat_load_ushort v5, v[1:2] offset:26
-; CHECK-NEXT:    flat_load_ushort v6, v[1:2] offset:24
-; CHECK-NEXT:    flat_load_ushort v7, v[1:2] offset:22
-; CHECK-NEXT:    flat_load_ushort v8, v[1:2] offset:20
-; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:18
-; CHECK-NEXT:    flat_load_ushort v10, v[1:2] offset:16
-; CHECK-NEXT:    flat_load_ushort v11, v[1:2] offset:14
-; CHECK-NEXT:    flat_load_ushort v12, v[1:2] offset:12
-; CHECK-NEXT:    flat_load_ushort v13, v[1:2] offset:10
-; CHECK-NEXT:    flat_load_ushort v14, v[1:2] offset:8
-; CHECK-NEXT:    flat_load_ushort v15, v[1:2] offset:6
-; CHECK-NEXT:    flat_load_ushort v16, v[1:2] offset:4
-; CHECK-NEXT:    flat_load_ushort v17, v[1:2] offset:2
-; CHECK-NEXT:    flat_load_ushort v1, v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(13)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(11)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
-; CHECK-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
-; CHECK-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[3:6], v[1:2] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[7:10], v[1:2]
 ; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_short v17, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 32, i1 false)
@@ -6689,61 +3403,19 @@ define void @memcpy_p5_p0_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p0_sz31_align_8_8:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x10
-; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:17
-; CHECK-NEXT:    flat_load_ubyte v6, v[1:2] offset:18
-; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:15
-; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:16
-; CHECK-NEXT:    flat_load_ubyte v9, v[1:2] offset:21
-; CHECK-NEXT:    flat_load_ubyte v10, v[1:2] offset:22
-; CHECK-NEXT:    flat_load_ubyte v11, v[1:2] offset:19
-; CHECK-NEXT:    flat_load_ubyte v12, v[1:2] offset:20
-; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:25
-; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:26
-; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:23
-; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:24
-; CHECK-NEXT:    flat_load_ubyte v17, v[1:2] offset:29
-; CHECK-NEXT:    flat_load_ubyte v18, v[1:2] offset:30
-; CHECK-NEXT:    flat_load_ubyte v19, v[1:2] offset:27
-; CHECK-NEXT:    flat_load_ubyte v20, v[1:2] offset:28
-; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(16) lgkmcnt(16)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[3:6], v[1:2] offset:15
+; CHECK-NEXT:    flat_load_dwordx4 v[7:10], v[1:2]
 ; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:15
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 31, i1 false)
@@ -6793,61 +3465,19 @@ define void @memcpy_p5_p0_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr a
 ; CHECK-LABEL: memcpy_p5_p0_sz31_align_16_16:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x10
-; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:17
-; CHECK-NEXT:    flat_load_ubyte v6, v[1:2] offset:18
-; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:15
-; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:16
-; CHECK-NEXT:    flat_load_ubyte v9, v[1:2] offset:21
-; CHECK-NEXT:    flat_load_ubyte v10, v[1:2] offset:22
-; CHECK-NEXT:    flat_load_ubyte v11, v[1:2] offset:19
-; CHECK-NEXT:    flat_load_ubyte v12, v[1:2] offset:20
-; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:25
-; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:26
-; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:23
-; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:24
-; CHECK-NEXT:    flat_load_ubyte v17, v[1:2] offset:29
-; CHECK-NEXT:    flat_load_ubyte v18, v[1:2] offset:30
-; CHECK-NEXT:    flat_load_ubyte v19, v[1:2] offset:27
-; CHECK-NEXT:    flat_load_ubyte v20, v[1:2] offset:28
-; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(16) lgkmcnt(16)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[3:6], v[1:2] offset:15
+; CHECK-NEXT:    flat_load_dwordx4 v[7:10], v[1:2]
 ; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:15
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 31, i1 false)
@@ -6881,55 +3511,12 @@ define void @memcpy_p5_p1_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p1_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ubyte v3, v[1:2], off offset:15
-; CHECK-NEXT:    global_load_ubyte v4, v[1:2], off offset:14
-; CHECK-NEXT:    global_load_ubyte v5, v[1:2], off offset:13
-; CHECK-NEXT:    global_load_ubyte v6, v[1:2], off offset:12
-; CHECK-NEXT:    global_load_ubyte v7, v[1:2], off offset:11
-; CHECK-NEXT:    global_load_ubyte v8, v[1:2], off offset:10
-; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:9
-; CHECK-NEXT:    global_load_ubyte v10, v[1:2], off offset:8
-; CHECK-NEXT:    global_load_ubyte v11, v[1:2], off offset:7
-; CHECK-NEXT:    global_load_ubyte v12, v[1:2], off offset:6
-; CHECK-NEXT:    global_load_ubyte v13, v[1:2], off offset:5
-; CHECK-NEXT:    global_load_ubyte v14, v[1:2], off offset:4
-; CHECK-NEXT:    global_load_ubyte v15, v[1:2], off offset:3
-; CHECK-NEXT:    global_load_ubyte v16, v[1:2], off offset:2
-; CHECK-NEXT:    global_load_ubyte v17, v[1:2], off offset:1
-; CHECK-NEXT:    global_load_ubyte v1, v[1:2], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 16, i1 false)
@@ -6940,207 +3527,47 @@ define void @memcpy_p5_p1_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p1_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1e
-; CHECK-NEXT:    global_load_ubyte v3, v[1:2], off
-; CHECK-NEXT:    global_load_ubyte v4, v[1:2], off offset:1
-; CHECK-NEXT:    global_load_ubyte v5, v[1:2], off offset:2
-; CHECK-NEXT:    global_load_ubyte v6, v[1:2], off offset:3
-; CHECK-NEXT:    global_load_ubyte v7, v[1:2], off offset:4
-; CHECK-NEXT:    global_load_ubyte v8, v[1:2], off offset:5
-; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:6
-; CHECK-NEXT:    global_load_ubyte v10, v[1:2], off offset:7
-; CHECK-NEXT:    global_load_ubyte v11, v[1:2], off offset:8
-; CHECK-NEXT:    global_load_ubyte v12, v[1:2], off offset:9
-; CHECK-NEXT:    global_load_ubyte v13, v[1:2], off offset:10
-; CHECK-NEXT:    global_load_ubyte v14, v[1:2], off offset:11
-; CHECK-NEXT:    global_load_ubyte v15, v[1:2], off offset:12
-; CHECK-NEXT:    global_load_ubyte v16, v[1:2], off offset:13
-; CHECK-NEXT:    global_load_ubyte v17, v[1:2], off offset:14
-; CHECK-NEXT:    global_load_ubyte v18, v[1:2], off offset:30
-; CHECK-NEXT:    global_load_ubyte v19, v[1:2], off offset:29
-; CHECK-NEXT:    global_load_ubyte v20, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ubyte v21, v[1:2], off offset:27
-; CHECK-NEXT:    global_load_ubyte v22, v[1:2], off offset:26
-; CHECK-NEXT:    global_load_ubyte v23, v[1:2], off offset:25
-; CHECK-NEXT:    global_load_ubyte v24, v[1:2], off offset:24
-; CHECK-NEXT:    global_load_ubyte v25, v[1:2], off offset:23
-; CHECK-NEXT:    global_load_ubyte v26, v[1:2], off offset:22
-; CHECK-NEXT:    global_load_ubyte v27, v[1:2], off offset:21
-; CHECK-NEXT:    global_load_ubyte v28, v[1:2], off offset:20
-; CHECK-NEXT:    global_load_ubyte v29, v[1:2], off offset:19
-; CHECK-NEXT:    global_load_ubyte v30, v[1:2], off offset:18
-; CHECK-NEXT:    global_load_ubyte v31, v[1:2], off offset:17
-; CHECK-NEXT:    global_load_ubyte v32, v[1:2], off offset:16
-; CHECK-NEXT:    global_load_ubyte v1, v[1:2], off offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(29)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(21)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v21, v0, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v22, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v23, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v24, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v25, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v26, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v27, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v28, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v29, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    global_load_dwordx3 v[5:7], v[1:2], off offset:16
+; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:28
+; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:30
+; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v30, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v31, v0, s[0:3], 0 offen offset:17
+; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v32, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:15
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 31, i1 false)
   ret void
-}
-
-define void @memcpy_p5_p1_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(1) align 1 readonly %src) {
-; CHECK-LABEL: memcpy_p5_p1_sz32_align_1_1:
-; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1f
-; CHECK-NEXT:    global_load_ubyte v3, v[1:2], off offset:15
-; CHECK-NEXT:    global_load_ubyte v4, v[1:2], off offset:14
-; CHECK-NEXT:    global_load_ubyte v5, v[1:2], off offset:13
-; CHECK-NEXT:    global_load_ubyte v6, v[1:2], off offset:12
-; CHECK-NEXT:    global_load_ubyte v7, v[1:2], off offset:11
-; CHECK-NEXT:    global_load_ubyte v8, v[1:2], off offset:10
-; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:9
-; CHECK-NEXT:    global_load_ubyte v10, v[1:2], off offset:8
-; CHECK-NEXT:    global_load_ubyte v11, v[1:2], off offset:7
-; CHECK-NEXT:    global_load_ubyte v12, v[1:2], off offset:6
-; CHECK-NEXT:    global_load_ubyte v13, v[1:2], off offset:5
-; CHECK-NEXT:    global_load_ubyte v14, v[1:2], off offset:4
-; CHECK-NEXT:    global_load_ubyte v15, v[1:2], off offset:3
-; CHECK-NEXT:    global_load_ubyte v16, v[1:2], off offset:2
-; CHECK-NEXT:    global_load_ubyte v17, v[1:2], off offset:1
-; CHECK-NEXT:    global_load_ubyte v18, v[1:2], off
-; CHECK-NEXT:    global_load_ubyte v19, v[1:2], off offset:31
-; CHECK-NEXT:    global_load_ubyte v20, v[1:2], off offset:30
-; CHECK-NEXT:    global_load_ubyte v21, v[1:2], off offset:29
-; CHECK-NEXT:    global_load_ubyte v22, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ubyte v23, v[1:2], off offset:27
-; CHECK-NEXT:    global_load_ubyte v24, v[1:2], off offset:26
-; CHECK-NEXT:    global_load_ubyte v25, v[1:2], off offset:25
-; CHECK-NEXT:    global_load_ubyte v26, v[1:2], off offset:24
-; CHECK-NEXT:    global_load_ubyte v27, v[1:2], off offset:23
-; CHECK-NEXT:    global_load_ubyte v28, v[1:2], off offset:22
-; CHECK-NEXT:    global_load_ubyte v29, v[1:2], off offset:21
-; CHECK-NEXT:    global_load_ubyte v30, v[1:2], off offset:20
-; CHECK-NEXT:    global_load_ubyte v31, v[1:2], off offset:19
-; CHECK-NEXT:    global_load_ubyte v32, v[1:2], off offset:18
-; CHECK-NEXT:    global_load_ubyte v33, v[1:2], off offset:17
-; CHECK-NEXT:    global_load_ubyte v1, v[1:2], off offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(31)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(29)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(21)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:31
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v21, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v22, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v23, v0, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v24, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v25, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v26, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v27, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v28, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v29, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v30, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v31, v0, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v32, v0, s[0:3], 0 offen offset:18
+}
+
+define void @memcpy_p5_p1_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(1) align 1 readonly %src) {
+; CHECK-LABEL: memcpy_p5_p1_sz32_align_1_1:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v33, v0, s[0:3], 0 offen offset:17
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 32, i1 false)
@@ -7151,31 +3578,12 @@ define void @memcpy_p5_p1_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p1_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    global_load_ushort v3, v[1:2], off
-; CHECK-NEXT:    global_load_ushort v4, v[1:2], off offset:2
-; CHECK-NEXT:    global_load_ushort v5, v[1:2], off offset:4
-; CHECK-NEXT:    global_load_ushort v6, v[1:2], off offset:6
-; CHECK-NEXT:    global_load_ushort v7, v[1:2], off offset:8
-; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:10
-; CHECK-NEXT:    global_load_ushort v9, v[1:2], off offset:12
-; CHECK-NEXT:    global_load_ushort v1, v[1:2], off offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 16, i1 false)
@@ -7186,55 +3594,24 @@ define void @memcpy_p5_p1_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p1_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ubyte v3, v[1:2], off offset:30
-; CHECK-NEXT:    global_load_ushort v4, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ushort v5, v[1:2], off offset:26
-; CHECK-NEXT:    global_load_ushort v6, v[1:2], off offset:24
-; CHECK-NEXT:    global_load_ushort v7, v[1:2], off offset:22
-; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:20
-; CHECK-NEXT:    global_load_ushort v9, v[1:2], off offset:18
-; CHECK-NEXT:    global_load_ushort v10, v[1:2], off offset:16
-; CHECK-NEXT:    global_load_ushort v11, v[1:2], off offset:14
-; CHECK-NEXT:    global_load_ushort v12, v[1:2], off offset:12
-; CHECK-NEXT:    global_load_ushort v13, v[1:2], off offset:10
-; CHECK-NEXT:    global_load_ushort v14, v[1:2], off offset:8
-; CHECK-NEXT:    global_load_ushort v15, v[1:2], off offset:6
-; CHECK-NEXT:    global_load_ushort v16, v[1:2], off offset:4
-; CHECK-NEXT:    global_load_ushort v17, v[1:2], off offset:2
-; CHECK-NEXT:    global_load_ushort v1, v[1:2], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    global_load_dwordx3 v[5:7], v[1:2], off offset:16
+; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:28
+; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:30
+; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_short v17, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -7245,55 +3622,19 @@ define void @memcpy_p5_p1_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p1_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ushort v3, v[1:2], off offset:30
-; CHECK-NEXT:    global_load_ushort v4, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ushort v5, v[1:2], off offset:26
-; CHECK-NEXT:    global_load_ushort v6, v[1:2], off offset:24
-; CHECK-NEXT:    global_load_ushort v7, v[1:2], off offset:22
-; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:20
-; CHECK-NEXT:    global_load_ushort v9, v[1:2], off offset:18
-; CHECK-NEXT:    global_load_ushort v10, v[1:2], off offset:16
-; CHECK-NEXT:    global_load_ushort v11, v[1:2], off offset:14
-; CHECK-NEXT:    global_load_ushort v12, v[1:2], off offset:12
-; CHECK-NEXT:    global_load_ushort v13, v[1:2], off offset:10
-; CHECK-NEXT:    global_load_ushort v14, v[1:2], off offset:8
-; CHECK-NEXT:    global_load_ushort v15, v[1:2], off offset:6
-; CHECK-NEXT:    global_load_ushort v16, v[1:2], off offset:4
-; CHECK-NEXT:    global_load_ushort v17, v[1:2], off offset:2
-; CHECK-NEXT:    global_load_ushort v1, v[1:2], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_short v17, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 32, i1 false)
@@ -7329,30 +3670,10 @@ define void @memcpy_p5_p1_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr addr
 ; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_store_byte_d16_hi v10, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    v_lshrrev_b32_e32 v1, 24, v7
-; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 8, v7
-; CHECK-NEXT:    v_lshrrev_b32_e32 v3, 24, v8
-; CHECK-NEXT:    v_lshrrev_b32_e32 v4, 8, v8
-; CHECK-NEXT:    v_lshrrev_b32_e32 v5, 24, v9
-; CHECK-NEXT:    v_lshrrev_b32_e32 v6, 8, v9
-; CHECK-NEXT:    v_lshrrev_b32_e32 v7, 24, v10
-; CHECK-NEXT:    v_lshrrev_b32_e32 v8, 8, v10
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:15
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(1) noundef nonnull align 8 %src, i64 31, i1 false)
@@ -7411,30 +3732,10 @@ define void @memcpy_p5_p1_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr a
 ; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_store_byte_d16_hi v10, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    v_lshrrev_b32_e32 v1, 24, v7
-; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 8, v7
-; CHECK-NEXT:    v_lshrrev_b32_e32 v3, 24, v8
-; CHECK-NEXT:    v_lshrrev_b32_e32 v4, 8, v8
-; CHECK-NEXT:    v_lshrrev_b32_e32 v5, 24, v9
-; CHECK-NEXT:    v_lshrrev_b32_e32 v6, 8, v9
-; CHECK-NEXT:    v_lshrrev_b32_e32 v7, 24, v10
-; CHECK-NEXT:    v_lshrrev_b32_e32 v8, 8, v10
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:15
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i64 31, i1 false)
@@ -7468,54 +3769,12 @@ define void @memcpy_p5_p3_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p3_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u8 v2, v1 offset:15
-; CHECK-NEXT:    ds_read_u8 v3, v1 offset:14
-; CHECK-NEXT:    ds_read_u8 v4, v1 offset:13
-; CHECK-NEXT:    ds_read_u8 v5, v1 offset:12
-; CHECK-NEXT:    ds_read_u8 v6, v1 offset:11
-; CHECK-NEXT:    ds_read_u8 v7, v1 offset:10
-; CHECK-NEXT:    ds_read_u8 v8, v1 offset:9
-; CHECK-NEXT:    ds_read_u8 v9, v1 offset:8
-; CHECK-NEXT:    ds_read_u8 v10, v1 offset:7
-; CHECK-NEXT:    ds_read_u8 v11, v1 offset:6
-; CHECK-NEXT:    ds_read_u8 v12, v1 offset:5
-; CHECK-NEXT:    ds_read_u8 v13, v1 offset:4
-; CHECK-NEXT:    ds_read_u8 v14, v1 offset:3
-; CHECK-NEXT:    ds_read_u8 v15, v1 offset:2
-; CHECK-NEXT:    ds_read_u8 v16, v1 offset:1
-; CHECK-NEXT:    ds_read_u8 v1, v1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt lgkmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt lgkmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt lgkmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt lgkmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt lgkmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    s_waitcnt lgkmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt lgkmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT:    ds_read2_b64 v[1:4], v1 offset1:1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 16, i1 false)
@@ -7526,85 +3785,25 @@ define void @memcpy_p5_p3_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p3_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u8 v2, v1
-; CHECK-NEXT:    ds_read_u8 v3, v1 offset:1
-; CHECK-NEXT:    ds_read_u8 v4, v1 offset:2
-; CHECK-NEXT:    ds_read_u8 v5, v1 offset:3
-; CHECK-NEXT:    ds_read_u8 v6, v1 offset:4
-; CHECK-NEXT:    ds_read_u8 v7, v1 offset:5
-; CHECK-NEXT:    ds_read_u8 v8, v1 offset:6
-; CHECK-NEXT:    ds_read_u8 v9, v1 offset:7
-; CHECK-NEXT:    ds_read_u8 v10, v1 offset:8
-; CHECK-NEXT:    ds_read_u8 v11, v1 offset:9
-; CHECK-NEXT:    ds_read_u8 v12, v1 offset:10
-; CHECK-NEXT:    ds_read_u8 v13, v1 offset:11
-; CHECK-NEXT:    ds_read_u8 v14, v1 offset:12
-; CHECK-NEXT:    ds_read_u8 v15, v1 offset:13
-; CHECK-NEXT:    ds_read_u8 v16, v1 offset:14
-; CHECK-NEXT:    ds_read_u8 v17, v1 offset:15
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt lgkmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    ds_read_u8 v2, v1 offset:24
-; CHECK-NEXT:    ds_read_u8 v3, v1 offset:25
-; CHECK-NEXT:    ds_read_u8 v4, v1 offset:26
-; CHECK-NEXT:    ds_read_u8 v18, v1 offset:27
-; CHECK-NEXT:    ds_read_u8 v19, v1 offset:28
-; CHECK-NEXT:    ds_read_u8 v20, v1 offset:29
-; CHECK-NEXT:    ds_read_u8 v21, v1 offset:30
-; CHECK-NEXT:    ds_read_u8 v22, v1 offset:16
-; CHECK-NEXT:    ds_read_u8 v23, v1 offset:17
-; CHECK-NEXT:    ds_read_u8 v24, v1 offset:18
-; CHECK-NEXT:    ds_read_u8 v25, v1 offset:19
-; CHECK-NEXT:    ds_read_u8 v26, v1 offset:20
-; CHECK-NEXT:    ds_read_u8 v27, v1 offset:21
-; CHECK-NEXT:    ds_read_u8 v28, v1 offset:22
-; CHECK-NEXT:    ds_read_u8 v1, v1 offset:23
-; CHECK-NEXT:    s_waitcnt lgkmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    s_waitcnt lgkmcnt(26)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt lgkmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    s_waitcnt lgkmcnt(24)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt lgkmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    s_waitcnt lgkmcnt(22)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt lgkmcnt(21)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt lgkmcnt(20)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt lgkmcnt(19)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt lgkmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt lgkmcnt(17)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt lgkmcnt(16)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt lgkmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v21, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_store_byte v28, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_store_byte v27, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_store_byte v26, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_store_byte v25, v0, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_store_byte v24, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_store_byte v23, v0, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_store_byte v22, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:15
+; CHECK-NEXT:    ds_read_b32 v8, v1 offset:24
+; CHECK-NEXT:    ds_read_u16 v9, v1 offset:28
+; CHECK-NEXT:    ds_read_u8 v10, v1 offset:30
+; CHECK-NEXT:    ds_read2_b64 v[2:5], v1 offset1:1
+; CHECK-NEXT:    ds_read_b64 v[6:7], v1 offset:16
+; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
+; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
+; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -7615,79 +3814,18 @@ define void @memcpy_p5_p3_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p3_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u8 v2, v1 offset:15
-; CHECK-NEXT:    ds_read_u8 v3, v1 offset:14
-; CHECK-NEXT:    ds_read_u8 v4, v1 offset:13
-; CHECK-NEXT:    ds_read_u8 v5, v1 offset:12
-; CHECK-NEXT:    ds_read_u8 v6, v1 offset:11
-; CHECK-NEXT:    ds_read_u8 v7, v1 offset:8
-; CHECK-NEXT:    ds_read_u8 v8, v1 offset:9
-; CHECK-NEXT:    ds_read_u8 v9, v1 offset:10
-; CHECK-NEXT:    ds_read_u8 v10, v1
-; CHECK-NEXT:    ds_read_u8 v11, v1 offset:1
-; CHECK-NEXT:    ds_read_u8 v12, v1 offset:2
-; CHECK-NEXT:    ds_read_u8 v13, v1 offset:3
-; CHECK-NEXT:    ds_read_u8 v14, v1 offset:4
-; CHECK-NEXT:    ds_read_u8 v15, v1 offset:5
-; CHECK-NEXT:    ds_read_u8 v16, v1 offset:6
-; CHECK-NEXT:    ds_read_u8 v17, v1 offset:7
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt lgkmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt lgkmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    ds_read_u8 v2, v1 offset:24
-; CHECK-NEXT:    ds_read_u8 v3, v1 offset:25
-; CHECK-NEXT:    ds_read_u8 v4, v1 offset:26
-; CHECK-NEXT:    ds_read_u8 v5, v1 offset:27
-; CHECK-NEXT:    ds_read_u8 v6, v1 offset:28
-; CHECK-NEXT:    ds_read_u8 v18, v1 offset:29
-; CHECK-NEXT:    ds_read_u8 v19, v1 offset:30
-; CHECK-NEXT:    ds_read_u8 v20, v1 offset:31
-; CHECK-NEXT:    ds_read_u8 v21, v1 offset:16
-; CHECK-NEXT:    ds_read_u8 v22, v1 offset:17
-; CHECK-NEXT:    ds_read_u8 v23, v1 offset:18
-; CHECK-NEXT:    ds_read_u8 v24, v1 offset:19
-; CHECK-NEXT:    ds_read_u8 v25, v1 offset:20
-; CHECK-NEXT:    ds_read_u8 v26, v1 offset:21
-; CHECK-NEXT:    ds_read_u8 v27, v1 offset:22
-; CHECK-NEXT:    ds_read_u8 v1, v1 offset:23
-; CHECK-NEXT:    s_waitcnt lgkmcnt(24)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt lgkmcnt(16)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt lgkmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:31
-; CHECK-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_store_byte v27, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_store_byte v26, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_store_byte v25, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_store_byte v24, v0, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_store_byte v23, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_store_byte v22, v0, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_store_byte v21, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    ds_read2_b64 v[2:5], v1 offset1:1
+; CHECK-NEXT:    ds_read2_b64 v[6:9], v1 offset0:2 offset1:3
+; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 32, i1 false)
@@ -7698,30 +3836,12 @@ define void @memcpy_p5_p3_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p3_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u16 v2, v1
-; CHECK-NEXT:    ds_read_u16 v3, v1 offset:2
-; CHECK-NEXT:    ds_read_u16 v4, v1 offset:4
-; CHECK-NEXT:    ds_read_u16 v5, v1 offset:6
-; CHECK-NEXT:    ds_read_u16 v6, v1 offset:8
-; CHECK-NEXT:    ds_read_u16 v7, v1 offset:10
-; CHECK-NEXT:    ds_read_u16 v8, v1 offset:12
-; CHECK-NEXT:    ds_read_u16 v1, v1 offset:14
-; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
-; CHECK-NEXT:    buffer_store_short v2, v0, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt lgkmcnt(6)
-; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(5)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    ds_read2_b64 v[1:4], v1 offset1:1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 16, i1 false)
@@ -7732,54 +3852,25 @@ define void @memcpy_p5_p3_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p3_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u8 v2, v1 offset:30
-; CHECK-NEXT:    ds_read_u16 v3, v1 offset:28
-; CHECK-NEXT:    ds_read_u16 v4, v1 offset:26
-; CHECK-NEXT:    ds_read_u16 v5, v1 offset:24
-; CHECK-NEXT:    ds_read_u16 v6, v1 offset:22
-; CHECK-NEXT:    ds_read_u16 v7, v1 offset:20
-; CHECK-NEXT:    ds_read_u16 v8, v1 offset:18
-; CHECK-NEXT:    ds_read_u16 v9, v1 offset:16
-; CHECK-NEXT:    ds_read_u16 v10, v1 offset:14
-; CHECK-NEXT:    ds_read_u16 v11, v1 offset:12
-; CHECK-NEXT:    ds_read_u16 v12, v1 offset:10
-; CHECK-NEXT:    ds_read_u16 v13, v1 offset:8
-; CHECK-NEXT:    ds_read_u16 v14, v1 offset:6
-; CHECK-NEXT:    ds_read_u16 v15, v1 offset:4
-; CHECK-NEXT:    ds_read_u16 v16, v1 offset:2
-; CHECK-NEXT:    ds_read_u16 v1, v1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt lgkmcnt(14)
-; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt lgkmcnt(11)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt lgkmcnt(10)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt lgkmcnt(9)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt lgkmcnt(8)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
-; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt lgkmcnt(6)
-; CHECK-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt lgkmcnt(5)
-; CHECK-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT:    ds_read_b32 v8, v1 offset:24
+; CHECK-NEXT:    ds_read_u16 v9, v1 offset:28
+; CHECK-NEXT:    ds_read_u8 v10, v1 offset:30
+; CHECK-NEXT:    ds_read2_b64 v[2:5], v1 offset1:1
+; CHECK-NEXT:    ds_read_b64 v[6:7], v1 offset:16
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -7790,54 +3881,18 @@ define void @memcpy_p5_p3_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p3_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u16 v2, v1 offset:30
-; CHECK-NEXT:    ds_read_u16 v3, v1 offset:28
-; CHECK-NEXT:    ds_read_u16 v4, v1 offset:26
-; CHECK-NEXT:    ds_read_u16 v5, v1 offset:24
-; CHECK-NEXT:    ds_read_u16 v6, v1 offset:22
-; CHECK-NEXT:    ds_read_u16 v7, v1 offset:20
-; CHECK-NEXT:    ds_read_u16 v8, v1 offset:18
-; CHECK-NEXT:    ds_read_u16 v9, v1 offset:16
-; CHECK-NEXT:    ds_read_u16 v10, v1 offset:14
-; CHECK-NEXT:    ds_read_u16 v11, v1 offset:12
-; CHECK-NEXT:    ds_read_u16 v12, v1 offset:10
-; CHECK-NEXT:    ds_read_u16 v13, v1 offset:8
-; CHECK-NEXT:    ds_read_u16 v14, v1 offset:6
-; CHECK-NEXT:    ds_read_u16 v15, v1 offset:4
-; CHECK-NEXT:    ds_read_u16 v16, v1 offset:2
-; CHECK-NEXT:    ds_read_u16 v1, v1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    buffer_store_short v2, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt lgkmcnt(14)
-; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt lgkmcnt(11)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt lgkmcnt(10)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt lgkmcnt(9)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt lgkmcnt(8)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
-; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt lgkmcnt(6)
-; CHECK-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt lgkmcnt(5)
-; CHECK-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    ds_read2_b64 v[2:5], v1 offset1:1
+; CHECK-NEXT:    ds_read2_b64 v[6:9], v1 offset0:2 offset1:3
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 32, i1 false)
@@ -7872,30 +3927,10 @@ define void @memcpy_p5_p3_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr addr
 ; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_byte_d16_hi v6, v0, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    v_lshrrev_b32_e32 v1, 24, v6
-; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 8, v6
-; CHECK-NEXT:    v_lshrrev_b32_e32 v3, 24, v7
-; CHECK-NEXT:    v_lshrrev_b32_e32 v4, 8, v7
-; CHECK-NEXT:    v_lshrrev_b32_e32 v5, 24, v8
-; CHECK-NEXT:    v_lshrrev_b32_e32 v6, 8, v8
-; CHECK-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
-; CHECK-NEXT:    v_lshrrev_b32_e32 v8, 8, v9
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:15
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(3) noundef nonnull align 8 %src, i64 31, i1 false)
@@ -7952,30 +3987,10 @@ define void @memcpy_p5_p3_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr a
 ; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_byte_d16_hi v6, v0, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    v_lshrrev_b32_e32 v1, 24, v6
-; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 8, v6
-; CHECK-NEXT:    v_lshrrev_b32_e32 v3, 24, v7
-; CHECK-NEXT:    v_lshrrev_b32_e32 v4, 8, v7
-; CHECK-NEXT:    v_lshrrev_b32_e32 v5, 24, v8
-; CHECK-NEXT:    v_lshrrev_b32_e32 v6, 8, v8
-; CHECK-NEXT:    v_lshrrev_b32_e32 v7, 24, v9
-; CHECK-NEXT:    v_lshrrev_b32_e32 v8, 8, v9
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:15
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(3) noundef nonnull align 16 %src, i64 31, i1 false)
@@ -8008,55 +4023,12 @@ define void @memcpy_p5_p4_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p4_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ubyte v3, v[1:2], off offset:15
-; CHECK-NEXT:    global_load_ubyte v4, v[1:2], off offset:14
-; CHECK-NEXT:    global_load_ubyte v5, v[1:2], off offset:13
-; CHECK-NEXT:    global_load_ubyte v6, v[1:2], off offset:12
-; CHECK-NEXT:    global_load_ubyte v7, v[1:2], off offset:11
-; CHECK-NEXT:    global_load_ubyte v8, v[1:2], off offset:10
-; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:9
-; CHECK-NEXT:    global_load_ubyte v10, v[1:2], off offset:8
-; CHECK-NEXT:    global_load_ubyte v11, v[1:2], off offset:7
-; CHECK-NEXT:    global_load_ubyte v12, v[1:2], off offset:6
-; CHECK-NEXT:    global_load_ubyte v13, v[1:2], off offset:5
-; CHECK-NEXT:    global_load_ubyte v14, v[1:2], off offset:4
-; CHECK-NEXT:    global_load_ubyte v15, v[1:2], off offset:3
-; CHECK-NEXT:    global_load_ubyte v16, v[1:2], off offset:2
-; CHECK-NEXT:    global_load_ubyte v17, v[1:2], off offset:1
-; CHECK-NEXT:    global_load_ubyte v1, v[1:2], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 16, i1 false)
@@ -8067,100 +4039,24 @@ define void @memcpy_p5_p4_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p4_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1e
-; CHECK-NEXT:    global_load_ubyte v3, v[1:2], off
-; CHECK-NEXT:    global_load_ubyte v4, v[1:2], off offset:1
-; CHECK-NEXT:    global_load_ubyte v5, v[1:2], off offset:2
-; CHECK-NEXT:    global_load_ubyte v6, v[1:2], off offset:3
-; CHECK-NEXT:    global_load_ubyte v7, v[1:2], off offset:4
-; CHECK-NEXT:    global_load_ubyte v8, v[1:2], off offset:5
-; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:6
-; CHECK-NEXT:    global_load_ubyte v10, v[1:2], off offset:7
-; CHECK-NEXT:    global_load_ubyte v11, v[1:2], off offset:8
-; CHECK-NEXT:    global_load_ubyte v12, v[1:2], off offset:9
-; CHECK-NEXT:    global_load_ubyte v13, v[1:2], off offset:10
-; CHECK-NEXT:    global_load_ubyte v14, v[1:2], off offset:11
-; CHECK-NEXT:    global_load_ubyte v15, v[1:2], off offset:12
-; CHECK-NEXT:    global_load_ubyte v16, v[1:2], off offset:13
-; CHECK-NEXT:    global_load_ubyte v17, v[1:2], off offset:14
-; CHECK-NEXT:    global_load_ubyte v18, v[1:2], off offset:30
-; CHECK-NEXT:    global_load_ubyte v19, v[1:2], off offset:29
-; CHECK-NEXT:    global_load_ubyte v20, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ubyte v21, v[1:2], off offset:27
-; CHECK-NEXT:    global_load_ubyte v22, v[1:2], off offset:26
-; CHECK-NEXT:    global_load_ubyte v23, v[1:2], off offset:25
-; CHECK-NEXT:    global_load_ubyte v24, v[1:2], off offset:24
-; CHECK-NEXT:    global_load_ubyte v25, v[1:2], off offset:23
-; CHECK-NEXT:    global_load_ubyte v26, v[1:2], off offset:22
-; CHECK-NEXT:    global_load_ubyte v27, v[1:2], off offset:21
-; CHECK-NEXT:    global_load_ubyte v28, v[1:2], off offset:20
-; CHECK-NEXT:    global_load_ubyte v29, v[1:2], off offset:19
-; CHECK-NEXT:    global_load_ubyte v30, v[1:2], off offset:18
-; CHECK-NEXT:    global_load_ubyte v31, v[1:2], off offset:17
-; CHECK-NEXT:    global_load_ubyte v32, v[1:2], off offset:16
-; CHECK-NEXT:    global_load_ubyte v1, v[1:2], off offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(29)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(21)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v21, v0, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v22, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v23, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v24, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v25, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v26, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v27, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v28, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v29, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    global_load_dwordx3 v[5:7], v[1:2], off offset:16
+; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:28
+; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:30
+; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v30, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v31, v0, s[0:3], 0 offen offset:17
+; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v32, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:15
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -8171,103 +4067,19 @@ define void @memcpy_p5_p4_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p4_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1f
-; CHECK-NEXT:    global_load_ubyte v3, v[1:2], off offset:15
-; CHECK-NEXT:    global_load_ubyte v4, v[1:2], off offset:14
-; CHECK-NEXT:    global_load_ubyte v5, v[1:2], off offset:13
-; CHECK-NEXT:    global_load_ubyte v6, v[1:2], off offset:12
-; CHECK-NEXT:    global_load_ubyte v7, v[1:2], off offset:11
-; CHECK-NEXT:    global_load_ubyte v8, v[1:2], off offset:10
-; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:9
-; CHECK-NEXT:    global_load_ubyte v10, v[1:2], off offset:8
-; CHECK-NEXT:    global_load_ubyte v11, v[1:2], off offset:7
-; CHECK-NEXT:    global_load_ubyte v12, v[1:2], off offset:6
-; CHECK-NEXT:    global_load_ubyte v13, v[1:2], off offset:5
-; CHECK-NEXT:    global_load_ubyte v14, v[1:2], off offset:4
-; CHECK-NEXT:    global_load_ubyte v15, v[1:2], off offset:3
-; CHECK-NEXT:    global_load_ubyte v16, v[1:2], off offset:2
-; CHECK-NEXT:    global_load_ubyte v17, v[1:2], off offset:1
-; CHECK-NEXT:    global_load_ubyte v18, v[1:2], off
-; CHECK-NEXT:    global_load_ubyte v19, v[1:2], off offset:31
-; CHECK-NEXT:    global_load_ubyte v20, v[1:2], off offset:30
-; CHECK-NEXT:    global_load_ubyte v21, v[1:2], off offset:29
-; CHECK-NEXT:    global_load_ubyte v22, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ubyte v23, v[1:2], off offset:27
-; CHECK-NEXT:    global_load_ubyte v24, v[1:2], off offset:26
-; CHECK-NEXT:    global_load_ubyte v25, v[1:2], off offset:25
-; CHECK-NEXT:    global_load_ubyte v26, v[1:2], off offset:24
-; CHECK-NEXT:    global_load_ubyte v27, v[1:2], off offset:23
-; CHECK-NEXT:    global_load_ubyte v28, v[1:2], off offset:22
-; CHECK-NEXT:    global_load_ubyte v29, v[1:2], off offset:21
-; CHECK-NEXT:    global_load_ubyte v30, v[1:2], off offset:20
-; CHECK-NEXT:    global_load_ubyte v31, v[1:2], off offset:19
-; CHECK-NEXT:    global_load_ubyte v32, v[1:2], off offset:18
-; CHECK-NEXT:    global_load_ubyte v33, v[1:2], off offset:17
-; CHECK-NEXT:    global_load_ubyte v1, v[1:2], off offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(31)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(29)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(21)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:31
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v21, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v22, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v23, v0, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v24, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v25, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v26, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v27, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v28, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v29, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v30, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v31, v0, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v32, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v33, v0, s[0:3], 0 offen offset:17
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 32, i1 false)
@@ -8278,31 +4090,12 @@ define void @memcpy_p5_p4_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p4_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    global_load_ushort v3, v[1:2], off
-; CHECK-NEXT:    global_load_ushort v4, v[1:2], off offset:2
-; CHECK-NEXT:    global_load_ushort v5, v[1:2], off offset:4
-; CHECK-NEXT:    global_load_ushort v6, v[1:2], off offset:6
-; CHECK-NEXT:    global_load_ushort v7, v[1:2], off offset:8
-; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:10
-; CHECK-NEXT:    global_load_ushort v9, v[1:2], off offset:12
-; CHECK-NEXT:    global_load_ushort v1, v[1:2], off offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 16, i1 false)
@@ -8313,55 +4106,24 @@ define void @memcpy_p5_p4_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p4_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ubyte v3, v[1:2], off offset:30
-; CHECK-NEXT:    global_load_ushort v4, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ushort v5, v[1:2], off offset:26
-; CHECK-NEXT:    global_load_ushort v6, v[1:2], off offset:24
-; CHECK-NEXT:    global_load_ushort v7, v[1:2], off offset:22
-; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:20
-; CHECK-NEXT:    global_load_ushort v9, v[1:2], off offset:18
-; CHECK-NEXT:    global_load_ushort v10, v[1:2], off offset:16
-; CHECK-NEXT:    global_load_ushort v11, v[1:2], off offset:14
-; CHECK-NEXT:    global_load_ushort v12, v[1:2], off offset:12
-; CHECK-NEXT:    global_load_ushort v13, v[1:2], off offset:10
-; CHECK-NEXT:    global_load_ushort v14, v[1:2], off offset:8
-; CHECK-NEXT:    global_load_ushort v15, v[1:2], off offset:6
-; CHECK-NEXT:    global_load_ushort v16, v[1:2], off offset:4
-; CHECK-NEXT:    global_load_ushort v17, v[1:2], off offset:2
-; CHECK-NEXT:    global_load_ushort v1, v[1:2], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    global_load_dwordx3 v[5:7], v[1:2], off offset:16
+; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:28
+; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:30
+; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_short v17, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -8372,55 +4134,19 @@ define void @memcpy_p5_p4_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p4_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ushort v3, v[1:2], off offset:30
-; CHECK-NEXT:    global_load_ushort v4, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ushort v5, v[1:2], off offset:26
-; CHECK-NEXT:    global_load_ushort v6, v[1:2], off offset:24
-; CHECK-NEXT:    global_load_ushort v7, v[1:2], off offset:22
-; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:20
-; CHECK-NEXT:    global_load_ushort v9, v[1:2], off offset:18
-; CHECK-NEXT:    global_load_ushort v10, v[1:2], off offset:16
-; CHECK-NEXT:    global_load_ushort v11, v[1:2], off offset:14
-; CHECK-NEXT:    global_load_ushort v12, v[1:2], off offset:12
-; CHECK-NEXT:    global_load_ushort v13, v[1:2], off offset:10
-; CHECK-NEXT:    global_load_ushort v14, v[1:2], off offset:8
-; CHECK-NEXT:    global_load_ushort v15, v[1:2], off offset:6
-; CHECK-NEXT:    global_load_ushort v16, v[1:2], off offset:4
-; CHECK-NEXT:    global_load_ushort v17, v[1:2], off offset:2
-; CHECK-NEXT:    global_load_ushort v1, v[1:2], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_short v17, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 32, i1 false)
@@ -8456,30 +4182,10 @@ define void @memcpy_p5_p4_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr addr
 ; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_store_byte_d16_hi v10, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    v_lshrrev_b32_e32 v1, 24, v7
-; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 8, v7
-; CHECK-NEXT:    v_lshrrev_b32_e32 v3, 24, v8
-; CHECK-NEXT:    v_lshrrev_b32_e32 v4, 8, v8
-; CHECK-NEXT:    v_lshrrev_b32_e32 v5, 24, v9
-; CHECK-NEXT:    v_lshrrev_b32_e32 v6, 8, v9
-; CHECK-NEXT:    v_lshrrev_b32_e32 v7, 24, v10
-; CHECK-NEXT:    v_lshrrev_b32_e32 v8, 8, v10
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:15
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(4) noundef nonnull align 8 %src, i64 31, i1 false)
@@ -8538,30 +4244,10 @@ define void @memcpy_p5_p4_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr a
 ; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_store_byte_d16_hi v10, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    v_lshrrev_b32_e32 v1, 24, v7
-; CHECK-NEXT:    v_lshrrev_b32_e32 v2, 8, v7
-; CHECK-NEXT:    v_lshrrev_b32_e32 v3, 24, v8
-; CHECK-NEXT:    v_lshrrev_b32_e32 v4, 8, v8
-; CHECK-NEXT:    v_lshrrev_b32_e32 v5, 24, v9
-; CHECK-NEXT:    v_lshrrev_b32_e32 v6, 8, v9
-; CHECK-NEXT:    v_lshrrev_b32_e32 v7, 24, v10
-; CHECK-NEXT:    v_lshrrev_b32_e32 v8, 8, v10
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:15
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(4) noundef nonnull align 16 %src, i64 31, i1 false)
@@ -8595,55 +4281,19 @@ define void @memcpy_p5_p5_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p5_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:3
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 16, i1 false)
@@ -8654,99 +4304,34 @@ define void @memcpy_p5_p5_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p5_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x11
-; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v16, v1, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen
-; CHECK-NEXT:    s_clause 0xc
-; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    s_clause 0x8
+; CHECK-NEXT:    buffer_load_ushort v2, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v9, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT:    buffer_store_short v2, v0, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:25
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:20
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:21
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:15
+; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -8757,103 +4342,31 @@ define void @memcpy_p5_p5_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p5_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x11
-; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v17, v1, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen
-; CHECK-NEXT:    s_clause 0xd
-; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:31
-; CHECK-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:31
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:26
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:25
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:22
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:20
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:21
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 32, i1 false)
@@ -8864,31 +4377,19 @@ define void @memcpy_p5_p5_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p5_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    buffer_load_ushort v2, v1, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ushort v3, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v4, v1, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ushort v5, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ushort v6, v1, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ushort v7, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ushort v8, v1, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ushort v1, v1, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_short v2, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 16, i1 false)
@@ -8899,55 +4400,34 @@ define void @memcpy_p5_p5_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p5_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ushort v3, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ushort v4, v1, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ushort v5, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ushort v6, v1, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ushort v7, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ushort v8, v1, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ushort v9, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ushort v10, v1, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ushort v11, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v12, v1, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ushort v13, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ushort v14, v1, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ushort v15, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ushort v16, v1, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ushort v1, v1, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT:    s_clause 0x8
+; CHECK-NEXT:    buffer_load_ushort v2, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v9, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_short v2, v0, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:20
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -8958,55 +4438,31 @@ define void @memcpy_p5_p5_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p5_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    buffer_load_ushort v2, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ushort v3, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ushort v4, v1, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ushort v5, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ushort v6, v1, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ushort v7, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ushort v8, v1, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ushort v9, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ushort v10, v1, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ushort v11, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v12, v1, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ushort v13, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ushort v14, v1, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ushort v15, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ushort v16, v1, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ushort v1, v1, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_short v2, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:20
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 32, i1 false)
@@ -9040,67 +4496,31 @@ define void @memcpy_p5_p5_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p5_sz31_align_8_8:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x13
-; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_dword v16, v1, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_dword v17, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_dword v18, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_dword v19, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:15
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:29
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:15
 ; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:19
 ; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_dword v16, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:23
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_dword v17, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:27
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(5) noundef nonnull align 8 %src, i64 31, i1 false)
@@ -9169,67 +4589,31 @@ define void @memcpy_p5_p5_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr a
 ; CHECK-LABEL: memcpy_p5_p5_sz31_align_16_16:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x13
-; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_dword v16, v1, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_dword v17, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_dword v18, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_dword v19, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:15
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:19
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:29
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:15
 ; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:19
 ; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_dword v16, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:23
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_dword v17, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:27
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(5) noundef nonnull align 16 %src, i64 31, i1 false)
diff --git a/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll b/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll
index cc5256620bfe08..4e5688adcd6bbd 100644
--- a/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll
+++ b/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll
@@ -13,55 +13,9 @@ define void @memmove_p0_p0_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p0_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:15
-; CHECK-NEXT:    flat_load_ubyte v5, v[2:3] offset:14
-; CHECK-NEXT:    flat_load_ubyte v6, v[2:3] offset:13
-; CHECK-NEXT:    flat_load_ubyte v7, v[2:3] offset:12
-; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:11
-; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:10
-; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:9
-; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:8
-; CHECK-NEXT:    flat_load_ubyte v12, v[2:3] offset:7
-; CHECK-NEXT:    flat_load_ubyte v13, v[2:3] offset:6
-; CHECK-NEXT:    flat_load_ubyte v14, v[2:3] offset:5
-; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:4
-; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:3
-; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:2
-; CHECK-NEXT:    flat_load_ubyte v18, v[2:3] offset:1
-; CHECK-NEXT:    flat_load_ubyte v2, v[2:3]
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:1
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2
+; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -73,100 +27,19 @@ define void @memmove_p0_p0_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p0_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1e
-; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:30
-; CHECK-NEXT:    flat_load_ubyte v5, v[2:3] offset:29
-; CHECK-NEXT:    flat_load_ubyte v6, v[2:3] offset:28
-; CHECK-NEXT:    flat_load_ubyte v7, v[2:3] offset:27
-; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:26
-; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:25
-; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:24
-; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:23
-; CHECK-NEXT:    flat_load_ubyte v12, v[2:3] offset:22
-; CHECK-NEXT:    flat_load_ubyte v13, v[2:3] offset:21
-; CHECK-NEXT:    flat_load_ubyte v14, v[2:3] offset:20
-; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:19
-; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:18
-; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:17
-; CHECK-NEXT:    flat_load_ubyte v18, v[2:3] offset:16
-; CHECK-NEXT:    flat_load_ubyte v19, v[2:3] offset:15
-; CHECK-NEXT:    flat_load_ubyte v20, v[2:3] offset:14
-; CHECK-NEXT:    flat_load_ubyte v21, v[2:3] offset:13
-; CHECK-NEXT:    flat_load_ubyte v22, v[2:3] offset:12
-; CHECK-NEXT:    flat_load_ubyte v23, v[2:3] offset:11
-; CHECK-NEXT:    flat_load_ubyte v24, v[2:3] offset:10
-; CHECK-NEXT:    flat_load_ubyte v25, v[2:3] offset:9
-; CHECK-NEXT:    flat_load_ubyte v26, v[2:3] offset:8
-; CHECK-NEXT:    flat_load_ubyte v27, v[2:3] offset:7
-; CHECK-NEXT:    flat_load_ubyte v28, v[2:3] offset:6
-; CHECK-NEXT:    flat_load_ubyte v29, v[2:3] offset:5
-; CHECK-NEXT:    flat_load_ubyte v30, v[2:3] offset:4
-; CHECK-NEXT:    flat_load_ubyte v31, v[2:3] offset:3
-; CHECK-NEXT:    flat_load_ubyte v32, v[2:3] offset:2
-; CHECK-NEXT:    flat_load_ubyte v33, v[2:3] offset:1
-; CHECK-NEXT:    flat_load_ubyte v2, v[2:3]
-; CHECK-NEXT:    s_waitcnt vmcnt(30) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(29) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(28) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(27) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(26) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(25) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(24) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(23) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(22) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(21) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(20) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(19) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(18) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(17) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(16) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v22 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v23 offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v24 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v25 offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v26 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v27 offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v28 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v29 offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v30 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v31 offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v32 offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v33 offset:1
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:30
+; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:28
+; CHECK-NEXT:    flat_load_dwordx3 v[6:8], v[2:3] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
+; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
+; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(3)
+; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(3)
+; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[6:8] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(3)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -178,103 +51,13 @@ define void @memmove_p0_p0_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p0_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1f
-; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:31
-; CHECK-NEXT:    flat_load_ubyte v5, v[2:3] offset:30
-; CHECK-NEXT:    flat_load_ubyte v6, v[2:3] offset:29
-; CHECK-NEXT:    flat_load_ubyte v7, v[2:3] offset:28
-; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:27
-; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:26
-; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:25
-; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:24
-; CHECK-NEXT:    flat_load_ubyte v12, v[2:3] offset:23
-; CHECK-NEXT:    flat_load_ubyte v13, v[2:3] offset:22
-; CHECK-NEXT:    flat_load_ubyte v14, v[2:3] offset:21
-; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:20
-; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:19
-; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:18
-; CHECK-NEXT:    flat_load_ubyte v18, v[2:3] offset:17
-; CHECK-NEXT:    flat_load_ubyte v19, v[2:3] offset:16
-; CHECK-NEXT:    flat_load_ubyte v20, v[2:3] offset:15
-; CHECK-NEXT:    flat_load_ubyte v21, v[2:3] offset:14
-; CHECK-NEXT:    flat_load_ubyte v22, v[2:3] offset:13
-; CHECK-NEXT:    flat_load_ubyte v23, v[2:3] offset:12
-; CHECK-NEXT:    flat_load_ubyte v24, v[2:3] offset:11
-; CHECK-NEXT:    flat_load_ubyte v25, v[2:3] offset:10
-; CHECK-NEXT:    flat_load_ubyte v26, v[2:3] offset:9
-; CHECK-NEXT:    flat_load_ubyte v27, v[2:3] offset:8
-; CHECK-NEXT:    flat_load_ubyte v28, v[2:3] offset:7
-; CHECK-NEXT:    flat_load_ubyte v29, v[2:3] offset:6
-; CHECK-NEXT:    flat_load_ubyte v30, v[2:3] offset:5
-; CHECK-NEXT:    flat_load_ubyte v31, v[2:3] offset:4
-; CHECK-NEXT:    flat_load_ubyte v32, v[2:3] offset:3
-; CHECK-NEXT:    flat_load_ubyte v33, v[2:3] offset:2
-; CHECK-NEXT:    flat_load_ubyte v34, v[2:3] offset:1
-; CHECK-NEXT:    flat_load_ubyte v2, v[2:3]
-; CHECK-NEXT:    s_waitcnt vmcnt(31) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:31
-; CHECK-NEXT:    s_waitcnt vmcnt(30) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(29) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(28) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(27) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(26) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(25) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(24) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(23) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(22) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(21) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(20) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(19) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(18) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(17) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(16) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v22 offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v23 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v24 offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v25 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v26 offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v27 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v28 offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v29 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v30 offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v31 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v32 offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v33 offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v34 offset:1
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[2:3] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[8:11], v[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -286,31 +69,9 @@ define void @memmove_p0_p0_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p0_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    flat_load_ushort v4, v[2:3] offset:14
-; CHECK-NEXT:    flat_load_ushort v5, v[2:3] offset:12
-; CHECK-NEXT:    flat_load_ushort v6, v[2:3] offset:10
-; CHECK-NEXT:    flat_load_ushort v7, v[2:3] offset:8
-; CHECK-NEXT:    flat_load_ushort v8, v[2:3] offset:6
-; CHECK-NEXT:    flat_load_ushort v9, v[2:3] offset:4
-; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:2
-; CHECK-NEXT:    flat_load_ushort v2, v[2:3]
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -322,55 +83,19 @@ define void @memmove_p0_p0_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p0_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:30
-; CHECK-NEXT:    flat_load_ushort v5, v[2:3] offset:28
-; CHECK-NEXT:    flat_load_ushort v6, v[2:3] offset:26
-; CHECK-NEXT:    flat_load_ushort v7, v[2:3] offset:24
-; CHECK-NEXT:    flat_load_ushort v8, v[2:3] offset:22
-; CHECK-NEXT:    flat_load_ushort v9, v[2:3] offset:20
-; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:18
-; CHECK-NEXT:    flat_load_ushort v11, v[2:3] offset:16
-; CHECK-NEXT:    flat_load_ushort v12, v[2:3] offset:14
-; CHECK-NEXT:    flat_load_ushort v13, v[2:3] offset:12
-; CHECK-NEXT:    flat_load_ushort v14, v[2:3] offset:10
-; CHECK-NEXT:    flat_load_ushort v15, v[2:3] offset:8
-; CHECK-NEXT:    flat_load_ushort v16, v[2:3] offset:6
-; CHECK-NEXT:    flat_load_ushort v17, v[2:3] offset:4
-; CHECK-NEXT:    flat_load_ushort v18, v[2:3] offset:2
-; CHECK-NEXT:    flat_load_ushort v2, v[2:3]
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v12 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v13 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v14 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v15 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v16 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v17 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v18 offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:30
+; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:28
+; CHECK-NEXT:    flat_load_dwordx3 v[6:8], v[2:3] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
+; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
+; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(3)
+; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(3)
+; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[6:8] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(3)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -382,55 +107,13 @@ define void @memmove_p0_p0_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p0_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ushort v4, v[2:3] offset:30
-; CHECK-NEXT:    flat_load_ushort v5, v[2:3] offset:28
-; CHECK-NEXT:    flat_load_ushort v6, v[2:3] offset:26
-; CHECK-NEXT:    flat_load_ushort v7, v[2:3] offset:24
-; CHECK-NEXT:    flat_load_ushort v8, v[2:3] offset:22
-; CHECK-NEXT:    flat_load_ushort v9, v[2:3] offset:20
-; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:18
-; CHECK-NEXT:    flat_load_ushort v11, v[2:3] offset:16
-; CHECK-NEXT:    flat_load_ushort v12, v[2:3] offset:14
-; CHECK-NEXT:    flat_load_ushort v13, v[2:3] offset:12
-; CHECK-NEXT:    flat_load_ushort v14, v[2:3] offset:10
-; CHECK-NEXT:    flat_load_ushort v15, v[2:3] offset:8
-; CHECK-NEXT:    flat_load_ushort v16, v[2:3] offset:6
-; CHECK-NEXT:    flat_load_ushort v17, v[2:3] offset:4
-; CHECK-NEXT:    flat_load_ushort v18, v[2:3] offset:2
-; CHECK-NEXT:    flat_load_ushort v2, v[2:3]
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v12 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v13 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v14 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v15 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v16 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v17 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v18 offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[2:3] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[8:11], v[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -559,55 +242,9 @@ define void @memmove_p0_p1_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p1_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:15
-; CHECK-NEXT:    global_load_ubyte v5, v[2:3], off offset:14
-; CHECK-NEXT:    global_load_ubyte v6, v[2:3], off offset:13
-; CHECK-NEXT:    global_load_ubyte v7, v[2:3], off offset:12
-; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:11
-; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:10
-; CHECK-NEXT:    global_load_ubyte v10, v[2:3], off offset:9
-; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:8
-; CHECK-NEXT:    global_load_ubyte v12, v[2:3], off offset:7
-; CHECK-NEXT:    global_load_ubyte v13, v[2:3], off offset:6
-; CHECK-NEXT:    global_load_ubyte v14, v[2:3], off offset:5
-; CHECK-NEXT:    global_load_ubyte v15, v[2:3], off offset:4
-; CHECK-NEXT:    global_load_ubyte v16, v[2:3], off offset:3
-; CHECK-NEXT:    global_load_ubyte v17, v[2:3], off offset:2
-; CHECK-NEXT:    global_load_ubyte v18, v[2:3], off offset:1
-; CHECK-NEXT:    global_load_ubyte v2, v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:1
+; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -619,100 +256,19 @@ define void @memmove_p0_p1_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p1_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1e
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:30
-; CHECK-NEXT:    global_load_ubyte v5, v[2:3], off offset:29
-; CHECK-NEXT:    global_load_ubyte v6, v[2:3], off offset:28
-; CHECK-NEXT:    global_load_ubyte v7, v[2:3], off offset:27
-; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:26
-; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:25
-; CHECK-NEXT:    global_load_ubyte v10, v[2:3], off offset:24
-; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:23
-; CHECK-NEXT:    global_load_ubyte v12, v[2:3], off offset:22
-; CHECK-NEXT:    global_load_ubyte v13, v[2:3], off offset:21
-; CHECK-NEXT:    global_load_ubyte v14, v[2:3], off offset:20
-; CHECK-NEXT:    global_load_ubyte v15, v[2:3], off offset:19
-; CHECK-NEXT:    global_load_ubyte v16, v[2:3], off offset:18
-; CHECK-NEXT:    global_load_ubyte v17, v[2:3], off offset:17
-; CHECK-NEXT:    global_load_ubyte v18, v[2:3], off offset:16
-; CHECK-NEXT:    global_load_ubyte v19, v[2:3], off offset:15
-; CHECK-NEXT:    global_load_ubyte v20, v[2:3], off offset:14
-; CHECK-NEXT:    global_load_ubyte v21, v[2:3], off offset:13
-; CHECK-NEXT:    global_load_ubyte v22, v[2:3], off offset:12
-; CHECK-NEXT:    global_load_ubyte v23, v[2:3], off offset:11
-; CHECK-NEXT:    global_load_ubyte v24, v[2:3], off offset:10
-; CHECK-NEXT:    global_load_ubyte v25, v[2:3], off offset:9
-; CHECK-NEXT:    global_load_ubyte v26, v[2:3], off offset:8
-; CHECK-NEXT:    global_load_ubyte v27, v[2:3], off offset:7
-; CHECK-NEXT:    global_load_ubyte v28, v[2:3], off offset:6
-; CHECK-NEXT:    global_load_ubyte v29, v[2:3], off offset:5
-; CHECK-NEXT:    global_load_ubyte v30, v[2:3], off offset:4
-; CHECK-NEXT:    global_load_ubyte v31, v[2:3], off offset:3
-; CHECK-NEXT:    global_load_ubyte v32, v[2:3], off offset:2
-; CHECK-NEXT:    global_load_ubyte v33, v[2:3], off offset:1
-; CHECK-NEXT:    global_load_ubyte v2, v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(29)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(21)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v22 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_byte v[0:1], v23 offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_byte v[0:1], v24 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_byte v[0:1], v25 offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_byte v[0:1], v26 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_byte v[0:1], v27 offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_byte v[0:1], v28 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_byte v[0:1], v29 offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v30 offset:4
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:30
+; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:28
+; CHECK-NEXT:    global_load_dwordx3 v[6:8], v[2:3], off offset:16
+; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v31 offset:3
+; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_byte v[0:1], v32 offset:2
+; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_byte v[0:1], v33 offset:1
+; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[6:8] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -724,103 +280,13 @@ define void @memmove_p0_p1_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p1_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1f
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:31
-; CHECK-NEXT:    global_load_ubyte v5, v[2:3], off offset:30
-; CHECK-NEXT:    global_load_ubyte v6, v[2:3], off offset:29
-; CHECK-NEXT:    global_load_ubyte v7, v[2:3], off offset:28
-; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:27
-; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:26
-; CHECK-NEXT:    global_load_ubyte v10, v[2:3], off offset:25
-; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:24
-; CHECK-NEXT:    global_load_ubyte v12, v[2:3], off offset:23
-; CHECK-NEXT:    global_load_ubyte v13, v[2:3], off offset:22
-; CHECK-NEXT:    global_load_ubyte v14, v[2:3], off offset:21
-; CHECK-NEXT:    global_load_ubyte v15, v[2:3], off offset:20
-; CHECK-NEXT:    global_load_ubyte v16, v[2:3], off offset:19
-; CHECK-NEXT:    global_load_ubyte v17, v[2:3], off offset:18
-; CHECK-NEXT:    global_load_ubyte v18, v[2:3], off offset:17
-; CHECK-NEXT:    global_load_ubyte v19, v[2:3], off offset:16
-; CHECK-NEXT:    global_load_ubyte v20, v[2:3], off offset:15
-; CHECK-NEXT:    global_load_ubyte v21, v[2:3], off offset:14
-; CHECK-NEXT:    global_load_ubyte v22, v[2:3], off offset:13
-; CHECK-NEXT:    global_load_ubyte v23, v[2:3], off offset:12
-; CHECK-NEXT:    global_load_ubyte v24, v[2:3], off offset:11
-; CHECK-NEXT:    global_load_ubyte v25, v[2:3], off offset:10
-; CHECK-NEXT:    global_load_ubyte v26, v[2:3], off offset:9
-; CHECK-NEXT:    global_load_ubyte v27, v[2:3], off offset:8
-; CHECK-NEXT:    global_load_ubyte v28, v[2:3], off offset:7
-; CHECK-NEXT:    global_load_ubyte v29, v[2:3], off offset:6
-; CHECK-NEXT:    global_load_ubyte v30, v[2:3], off offset:5
-; CHECK-NEXT:    global_load_ubyte v31, v[2:3], off offset:4
-; CHECK-NEXT:    global_load_ubyte v32, v[2:3], off offset:3
-; CHECK-NEXT:    global_load_ubyte v33, v[2:3], off offset:2
-; CHECK-NEXT:    global_load_ubyte v34, v[2:3], off offset:1
-; CHECK-NEXT:    global_load_ubyte v2, v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:31
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(29)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(21)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v22 offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v23 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_byte v[0:1], v24 offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_byte v[0:1], v25 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_byte v[0:1], v26 offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_byte v[0:1], v27 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_byte v[0:1], v28 offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_byte v[0:1], v29 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_byte v[0:1], v30 offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v31 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v32 offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_byte v[0:1], v33 offset:2
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:16
+; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_byte v[0:1], v34 offset:1
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -832,31 +298,9 @@ define void @memmove_p0_p1_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p1_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    global_load_ushort v4, v[2:3], off offset:14
-; CHECK-NEXT:    global_load_ushort v5, v[2:3], off offset:12
-; CHECK-NEXT:    global_load_ushort v6, v[2:3], off offset:10
-; CHECK-NEXT:    global_load_ushort v7, v[2:3], off offset:8
-; CHECK-NEXT:    global_load_ushort v8, v[2:3], off offset:6
-; CHECK-NEXT:    global_load_ushort v9, v[2:3], off offset:4
-; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:2
-; CHECK-NEXT:    global_load_ushort v2, v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:2
+; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -868,55 +312,19 @@ define void @memmove_p0_p1_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p1_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:30
-; CHECK-NEXT:    global_load_ushort v5, v[2:3], off offset:28
-; CHECK-NEXT:    global_load_ushort v6, v[2:3], off offset:26
-; CHECK-NEXT:    global_load_ushort v7, v[2:3], off offset:24
-; CHECK-NEXT:    global_load_ushort v8, v[2:3], off offset:22
-; CHECK-NEXT:    global_load_ushort v9, v[2:3], off offset:20
-; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:18
-; CHECK-NEXT:    global_load_ushort v11, v[2:3], off offset:16
-; CHECK-NEXT:    global_load_ushort v12, v[2:3], off offset:14
-; CHECK-NEXT:    global_load_ushort v13, v[2:3], off offset:12
-; CHECK-NEXT:    global_load_ushort v14, v[2:3], off offset:10
-; CHECK-NEXT:    global_load_ushort v15, v[2:3], off offset:8
-; CHECK-NEXT:    global_load_ushort v16, v[2:3], off offset:6
-; CHECK-NEXT:    global_load_ushort v17, v[2:3], off offset:4
-; CHECK-NEXT:    global_load_ushort v18, v[2:3], off offset:2
-; CHECK-NEXT:    global_load_ushort v2, v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v12 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_short v[0:1], v13 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_short v[0:1], v14 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_short v[0:1], v15 offset:8
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:30
+; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:28
+; CHECK-NEXT:    global_load_dwordx3 v[6:8], v[2:3], off offset:16
+; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_short v[0:1], v16 offset:6
+; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_short v[0:1], v17 offset:4
+; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_short v[0:1], v18 offset:2
+; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[6:8] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -928,55 +336,13 @@ define void @memmove_p0_p1_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p1_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ushort v4, v[2:3], off offset:30
-; CHECK-NEXT:    global_load_ushort v5, v[2:3], off offset:28
-; CHECK-NEXT:    global_load_ushort v6, v[2:3], off offset:26
-; CHECK-NEXT:    global_load_ushort v7, v[2:3], off offset:24
-; CHECK-NEXT:    global_load_ushort v8, v[2:3], off offset:22
-; CHECK-NEXT:    global_load_ushort v9, v[2:3], off offset:20
-; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:18
-; CHECK-NEXT:    global_load_ushort v11, v[2:3], off offset:16
-; CHECK-NEXT:    global_load_ushort v12, v[2:3], off offset:14
-; CHECK-NEXT:    global_load_ushort v13, v[2:3], off offset:12
-; CHECK-NEXT:    global_load_ushort v14, v[2:3], off offset:10
-; CHECK-NEXT:    global_load_ushort v15, v[2:3], off offset:8
-; CHECK-NEXT:    global_load_ushort v16, v[2:3], off offset:6
-; CHECK-NEXT:    global_load_ushort v17, v[2:3], off offset:4
-; CHECK-NEXT:    global_load_ushort v18, v[2:3], off offset:2
-; CHECK-NEXT:    global_load_ushort v2, v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v12 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_short v[0:1], v13 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_short v[0:1], v14 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_short v[0:1], v15 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_short v[0:1], v16 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_short v[0:1], v17 offset:4
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:16
+; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_short v[0:1], v18 offset:2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1105,54 +471,9 @@ define void @memmove_p0_p3_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p3_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:15
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:14
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:13
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:12
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:11
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:10
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:9
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:8
-; CHECK-NEXT:    ds_read_u8 v11, v2 offset:7
-; CHECK-NEXT:    ds_read_u8 v12, v2 offset:6
-; CHECK-NEXT:    ds_read_u8 v13, v2 offset:5
-; CHECK-NEXT:    ds_read_u8 v14, v2 offset:4
-; CHECK-NEXT:    ds_read_u8 v15, v2 offset:3
-; CHECK-NEXT:    ds_read_u8 v16, v2 offset:2
-; CHECK-NEXT:    ds_read_u8 v17, v2 offset:1
-; CHECK-NEXT:    ds_read_u8 v2, v2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:15
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:14
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:13
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:12
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:11
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:10
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:9
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:8
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:7
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:6
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:5
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:4
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:3
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2
+; CHECK-NEXT:    ds_read2_b64 v[2:5], v2 offset1:1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1164,72 +485,19 @@ define void @memmove_p0_p3_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p3_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:24
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:25
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:26
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:27
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:28
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:29
 ; CHECK-NEXT:    ds_read_u8 v9, v2 offset:30
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:16
-; CHECK-NEXT:    ds_read_u8 v11, v2 offset:17
-; CHECK-NEXT:    ds_read_u8 v12, v2 offset:18
-; CHECK-NEXT:    ds_read_u8 v13, v2 offset:19
-; CHECK-NEXT:    ds_read_u8 v14, v2 offset:20
-; CHECK-NEXT:    ds_read_u8 v15, v2 offset:21
-; CHECK-NEXT:    ds_read_u8 v16, v2 offset:22
-; CHECK-NEXT:    ds_read_u8 v17, v2 offset:23
-; CHECK-NEXT:    ds_read_u8 v18, v2 offset:8
-; CHECK-NEXT:    ds_read_u8 v19, v2 offset:9
-; CHECK-NEXT:    ds_read_u8 v20, v2 offset:10
-; CHECK-NEXT:    ds_read_u8 v21, v2 offset:11
-; CHECK-NEXT:    ds_read_u8 v22, v2 offset:12
-; CHECK-NEXT:    ds_read_u8 v23, v2 offset:13
-; CHECK-NEXT:    ds_read_u8 v24, v2 offset:14
-; CHECK-NEXT:    ds_read_u8 v25, v2 offset:15
-; CHECK-NEXT:    ds_read_u8 v26, v2
-; CHECK-NEXT:    ds_read_u8 v27, v2 offset:1
-; CHECK-NEXT:    ds_read_u8 v28, v2 offset:2
-; CHECK-NEXT:    ds_read_u8 v29, v2 offset:3
-; CHECK-NEXT:    ds_read_u8 v30, v2 offset:4
-; CHECK-NEXT:    ds_read_u8 v31, v2 offset:5
-; CHECK-NEXT:    ds_read_u8 v32, v2 offset:6
-; CHECK-NEXT:    ds_read_u8 v2, v2 offset:7
-; CHECK-NEXT:    s_waitcnt lgkmcnt(24)
+; CHECK-NEXT:    ds_read_b32 v8, v2 offset:24
+; CHECK-NEXT:    ds_read_u16 v10, v2 offset:28
+; CHECK-NEXT:    ds_read_b64 v[6:7], v2 offset:16
+; CHECK-NEXT:    ds_read2_b64 v[2:5], v2 offset1:1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
 ; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:29
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:28
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:27
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:26
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:25
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:24
-; CHECK-NEXT:    s_waitcnt lgkmcnt(23)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:23
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:22
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:21
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:20
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:19
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:18
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:17
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:16
-; CHECK-NEXT:    s_waitcnt lgkmcnt(23)
-; CHECK-NEXT:    flat_store_byte v[0:1], v25 offset:15
-; CHECK-NEXT:    flat_store_byte v[0:1], v24 offset:14
-; CHECK-NEXT:    flat_store_byte v[0:1], v23 offset:13
-; CHECK-NEXT:    flat_store_byte v[0:1], v22 offset:12
-; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:11
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:10
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:9
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:8
-; CHECK-NEXT:    s_waitcnt lgkmcnt(23)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:7
-; CHECK-NEXT:    flat_store_byte v[0:1], v32 offset:6
-; CHECK-NEXT:    flat_store_byte v[0:1], v31 offset:5
-; CHECK-NEXT:    flat_store_byte v[0:1], v30 offset:4
-; CHECK-NEXT:    flat_store_byte v[0:1], v29 offset:3
-; CHECK-NEXT:    flat_store_byte v[0:1], v28 offset:2
-; CHECK-NEXT:    flat_store_byte v[0:1], v27 offset:1
-; CHECK-NEXT:    flat_store_byte v[0:1], v26
+; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
+; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
+; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[6:8] offset:16
+; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1241,74 +509,12 @@ define void @memmove_p0_p3_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p3_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:24
-; CHECK-NEXT:    ds_read_u8 v4, v2 offset:25
-; CHECK-NEXT:    ds_read_u8 v5, v2 offset:26
-; CHECK-NEXT:    ds_read_u8 v6, v2 offset:27
-; CHECK-NEXT:    ds_read_u8 v7, v2 offset:28
-; CHECK-NEXT:    ds_read_u8 v8, v2 offset:29
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:30
-; CHECK-NEXT:    ds_read_u8 v10, v2 offset:31
-; CHECK-NEXT:    ds_read_u8 v11, v2 offset:16
-; CHECK-NEXT:    ds_read_u8 v12, v2 offset:17
-; CHECK-NEXT:    ds_read_u8 v13, v2 offset:18
-; CHECK-NEXT:    ds_read_u8 v14, v2 offset:19
-; CHECK-NEXT:    ds_read_u8 v15, v2 offset:20
-; CHECK-NEXT:    ds_read_u8 v16, v2 offset:21
-; CHECK-NEXT:    ds_read_u8 v17, v2 offset:22
-; CHECK-NEXT:    ds_read_u8 v18, v2 offset:23
-; CHECK-NEXT:    ds_read_u8 v19, v2 offset:8
-; CHECK-NEXT:    ds_read_u8 v20, v2 offset:9
-; CHECK-NEXT:    ds_read_u8 v21, v2 offset:10
-; CHECK-NEXT:    ds_read_u8 v22, v2 offset:11
-; CHECK-NEXT:    ds_read_u8 v23, v2 offset:12
-; CHECK-NEXT:    ds_read_u8 v24, v2 offset:13
-; CHECK-NEXT:    ds_read_u8 v25, v2 offset:14
-; CHECK-NEXT:    ds_read_u8 v26, v2 offset:15
-; CHECK-NEXT:    ds_read_u8 v27, v2
-; CHECK-NEXT:    ds_read_u8 v28, v2 offset:1
-; CHECK-NEXT:    ds_read_u8 v29, v2 offset:2
-; CHECK-NEXT:    ds_read_u8 v30, v2 offset:3
-; CHECK-NEXT:    ds_read_u8 v31, v2 offset:4
-; CHECK-NEXT:    ds_read_u8 v32, v2 offset:5
-; CHECK-NEXT:    ds_read_u8 v33, v2 offset:6
-; CHECK-NEXT:    ds_read_u8 v2, v2 offset:7
-; CHECK-NEXT:    s_waitcnt lgkmcnt(24)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:31
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:29
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:28
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:27
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:26
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:25
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:24
-; CHECK-NEXT:    s_waitcnt lgkmcnt(24)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:23
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:22
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:21
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:20
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:19
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:18
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:17
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:16
-; CHECK-NEXT:    s_waitcnt lgkmcnt(24)
-; CHECK-NEXT:    flat_store_byte v[0:1], v26 offset:15
-; CHECK-NEXT:    flat_store_byte v[0:1], v25 offset:14
-; CHECK-NEXT:    flat_store_byte v[0:1], v24 offset:13
-; CHECK-NEXT:    flat_store_byte v[0:1], v23 offset:12
-; CHECK-NEXT:    flat_store_byte v[0:1], v22 offset:11
-; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:10
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:9
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:8
-; CHECK-NEXT:    s_waitcnt lgkmcnt(24)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:7
-; CHECK-NEXT:    flat_store_byte v[0:1], v33 offset:6
-; CHECK-NEXT:    flat_store_byte v[0:1], v32 offset:5
-; CHECK-NEXT:    flat_store_byte v[0:1], v31 offset:4
-; CHECK-NEXT:    flat_store_byte v[0:1], v30 offset:3
-; CHECK-NEXT:    flat_store_byte v[0:1], v29 offset:2
-; CHECK-NEXT:    flat_store_byte v[0:1], v28 offset:1
-; CHECK-NEXT:    flat_store_byte v[0:1], v27
+; CHECK-NEXT:    ds_read2_b64 v[3:6], v2 offset0:2 offset1:3
+; CHECK-NEXT:    ds_read2_b64 v[7:10], v2 offset1:1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6] offset:16
+; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[7:10]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1320,30 +526,9 @@ define void @memmove_p0_p3_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p3_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u16 v3, v2 offset:14
-; CHECK-NEXT:    ds_read_u16 v4, v2 offset:12
-; CHECK-NEXT:    ds_read_u16 v5, v2 offset:10
-; CHECK-NEXT:    ds_read_u16 v6, v2 offset:8
-; CHECK-NEXT:    ds_read_u16 v7, v2 offset:6
-; CHECK-NEXT:    ds_read_u16 v8, v2 offset:4
-; CHECK-NEXT:    ds_read_u16 v9, v2 offset:2
-; CHECK-NEXT:    ds_read_u16 v2, v2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v3 offset:14
-; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:12
-; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:10
-; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:8
-; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:6
-; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:4
-; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    ds_read2_b64 v[2:5], v2 offset1:1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1355,54 +540,19 @@ define void @memmove_p0_p3_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p3_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u8 v3, v2 offset:30
-; CHECK-NEXT:    ds_read_u16 v4, v2 offset:28
-; CHECK-NEXT:    ds_read_u16 v5, v2 offset:26
-; CHECK-NEXT:    ds_read_u16 v6, v2 offset:24
-; CHECK-NEXT:    ds_read_u16 v7, v2 offset:22
-; CHECK-NEXT:    ds_read_u16 v8, v2 offset:20
-; CHECK-NEXT:    ds_read_u16 v9, v2 offset:18
-; CHECK-NEXT:    ds_read_u16 v10, v2 offset:16
-; CHECK-NEXT:    ds_read_u16 v11, v2 offset:14
-; CHECK-NEXT:    ds_read_u16 v12, v2 offset:12
-; CHECK-NEXT:    ds_read_u16 v13, v2 offset:10
-; CHECK-NEXT:    ds_read_u16 v14, v2 offset:8
-; CHECK-NEXT:    ds_read_u16 v15, v2 offset:6
-; CHECK-NEXT:    ds_read_u16 v16, v2 offset:4
-; CHECK-NEXT:    ds_read_u16 v17, v2 offset:2
-; CHECK-NEXT:    ds_read_u16 v2, v2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:30
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:28
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:26
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:24
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:22
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:20
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:18
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:16
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:14
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v12 offset:12
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v13 offset:10
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v14 offset:8
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v15 offset:6
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v16 offset:4
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v17 offset:2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    ds_read_u8 v9, v2 offset:30
+; CHECK-NEXT:    ds_read_b32 v8, v2 offset:24
+; CHECK-NEXT:    ds_read_u16 v10, v2 offset:28
+; CHECK-NEXT:    ds_read_b64 v[6:7], v2 offset:16
+; CHECK-NEXT:    ds_read2_b64 v[2:5], v2 offset1:1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
+; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
+; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
+; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
+; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[6:8] offset:16
+; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1414,54 +564,12 @@ define void @memmove_p0_p3_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p3_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u16 v3, v2 offset:30
-; CHECK-NEXT:    ds_read_u16 v4, v2 offset:28
-; CHECK-NEXT:    ds_read_u16 v5, v2 offset:26
-; CHECK-NEXT:    ds_read_u16 v6, v2 offset:24
-; CHECK-NEXT:    ds_read_u16 v7, v2 offset:22
-; CHECK-NEXT:    ds_read_u16 v8, v2 offset:20
-; CHECK-NEXT:    ds_read_u16 v9, v2 offset:18
-; CHECK-NEXT:    ds_read_u16 v10, v2 offset:16
-; CHECK-NEXT:    ds_read_u16 v11, v2 offset:14
-; CHECK-NEXT:    ds_read_u16 v12, v2 offset:12
-; CHECK-NEXT:    ds_read_u16 v13, v2 offset:10
-; CHECK-NEXT:    ds_read_u16 v14, v2 offset:8
-; CHECK-NEXT:    ds_read_u16 v15, v2 offset:6
-; CHECK-NEXT:    ds_read_u16 v16, v2 offset:4
-; CHECK-NEXT:    ds_read_u16 v17, v2 offset:2
-; CHECK-NEXT:    ds_read_u16 v2, v2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v3 offset:30
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:28
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:26
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:24
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:22
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:20
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:18
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:16
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:14
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v12 offset:12
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v13 offset:10
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v14 offset:8
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v15 offset:6
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v16 offset:4
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v17 offset:2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    ds_read2_b64 v[3:6], v2 offset0:2 offset1:3
+; CHECK-NEXT:    ds_read2_b64 v[7:10], v2 offset1:1
+; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6] offset:16
+; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[7:10]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1583,55 +691,9 @@ define void @memmove_p0_p4_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p4_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:15
-; CHECK-NEXT:    global_load_ubyte v5, v[2:3], off offset:14
-; CHECK-NEXT:    global_load_ubyte v6, v[2:3], off offset:13
-; CHECK-NEXT:    global_load_ubyte v7, v[2:3], off offset:12
-; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:11
-; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:10
-; CHECK-NEXT:    global_load_ubyte v10, v[2:3], off offset:9
-; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:8
-; CHECK-NEXT:    global_load_ubyte v12, v[2:3], off offset:7
-; CHECK-NEXT:    global_load_ubyte v13, v[2:3], off offset:6
-; CHECK-NEXT:    global_load_ubyte v14, v[2:3], off offset:5
-; CHECK-NEXT:    global_load_ubyte v15, v[2:3], off offset:4
-; CHECK-NEXT:    global_load_ubyte v16, v[2:3], off offset:3
-; CHECK-NEXT:    global_load_ubyte v17, v[2:3], off offset:2
-; CHECK-NEXT:    global_load_ubyte v18, v[2:3], off offset:1
-; CHECK-NEXT:    global_load_ubyte v2, v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:1
+; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1643,100 +705,19 @@ define void @memmove_p0_p4_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p4_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1e
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:30
-; CHECK-NEXT:    global_load_ubyte v5, v[2:3], off offset:29
-; CHECK-NEXT:    global_load_ubyte v6, v[2:3], off offset:28
-; CHECK-NEXT:    global_load_ubyte v7, v[2:3], off offset:27
-; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:26
-; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:25
-; CHECK-NEXT:    global_load_ubyte v10, v[2:3], off offset:24
-; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:23
-; CHECK-NEXT:    global_load_ubyte v12, v[2:3], off offset:22
-; CHECK-NEXT:    global_load_ubyte v13, v[2:3], off offset:21
-; CHECK-NEXT:    global_load_ubyte v14, v[2:3], off offset:20
-; CHECK-NEXT:    global_load_ubyte v15, v[2:3], off offset:19
-; CHECK-NEXT:    global_load_ubyte v16, v[2:3], off offset:18
-; CHECK-NEXT:    global_load_ubyte v17, v[2:3], off offset:17
-; CHECK-NEXT:    global_load_ubyte v18, v[2:3], off offset:16
-; CHECK-NEXT:    global_load_ubyte v19, v[2:3], off offset:15
-; CHECK-NEXT:    global_load_ubyte v20, v[2:3], off offset:14
-; CHECK-NEXT:    global_load_ubyte v21, v[2:3], off offset:13
-; CHECK-NEXT:    global_load_ubyte v22, v[2:3], off offset:12
-; CHECK-NEXT:    global_load_ubyte v23, v[2:3], off offset:11
-; CHECK-NEXT:    global_load_ubyte v24, v[2:3], off offset:10
-; CHECK-NEXT:    global_load_ubyte v25, v[2:3], off offset:9
-; CHECK-NEXT:    global_load_ubyte v26, v[2:3], off offset:8
-; CHECK-NEXT:    global_load_ubyte v27, v[2:3], off offset:7
-; CHECK-NEXT:    global_load_ubyte v28, v[2:3], off offset:6
-; CHECK-NEXT:    global_load_ubyte v29, v[2:3], off offset:5
-; CHECK-NEXT:    global_load_ubyte v30, v[2:3], off offset:4
-; CHECK-NEXT:    global_load_ubyte v31, v[2:3], off offset:3
-; CHECK-NEXT:    global_load_ubyte v32, v[2:3], off offset:2
-; CHECK-NEXT:    global_load_ubyte v33, v[2:3], off offset:1
-; CHECK-NEXT:    global_load_ubyte v2, v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(29)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(21)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v22 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_byte v[0:1], v23 offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_byte v[0:1], v24 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_byte v[0:1], v25 offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_byte v[0:1], v26 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_byte v[0:1], v27 offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_byte v[0:1], v28 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_byte v[0:1], v29 offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v30 offset:4
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:30
+; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:28
+; CHECK-NEXT:    global_load_dwordx3 v[6:8], v[2:3], off offset:16
+; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v31 offset:3
+; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_byte v[0:1], v32 offset:2
+; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_byte v[0:1], v33 offset:1
+; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[6:8] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1748,103 +729,13 @@ define void @memmove_p0_p4_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p4_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1f
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:31
-; CHECK-NEXT:    global_load_ubyte v5, v[2:3], off offset:30
-; CHECK-NEXT:    global_load_ubyte v6, v[2:3], off offset:29
-; CHECK-NEXT:    global_load_ubyte v7, v[2:3], off offset:28
-; CHECK-NEXT:    global_load_ubyte v8, v[2:3], off offset:27
-; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:26
-; CHECK-NEXT:    global_load_ubyte v10, v[2:3], off offset:25
-; CHECK-NEXT:    global_load_ubyte v11, v[2:3], off offset:24
-; CHECK-NEXT:    global_load_ubyte v12, v[2:3], off offset:23
-; CHECK-NEXT:    global_load_ubyte v13, v[2:3], off offset:22
-; CHECK-NEXT:    global_load_ubyte v14, v[2:3], off offset:21
-; CHECK-NEXT:    global_load_ubyte v15, v[2:3], off offset:20
-; CHECK-NEXT:    global_load_ubyte v16, v[2:3], off offset:19
-; CHECK-NEXT:    global_load_ubyte v17, v[2:3], off offset:18
-; CHECK-NEXT:    global_load_ubyte v18, v[2:3], off offset:17
-; CHECK-NEXT:    global_load_ubyte v19, v[2:3], off offset:16
-; CHECK-NEXT:    global_load_ubyte v20, v[2:3], off offset:15
-; CHECK-NEXT:    global_load_ubyte v21, v[2:3], off offset:14
-; CHECK-NEXT:    global_load_ubyte v22, v[2:3], off offset:13
-; CHECK-NEXT:    global_load_ubyte v23, v[2:3], off offset:12
-; CHECK-NEXT:    global_load_ubyte v24, v[2:3], off offset:11
-; CHECK-NEXT:    global_load_ubyte v25, v[2:3], off offset:10
-; CHECK-NEXT:    global_load_ubyte v26, v[2:3], off offset:9
-; CHECK-NEXT:    global_load_ubyte v27, v[2:3], off offset:8
-; CHECK-NEXT:    global_load_ubyte v28, v[2:3], off offset:7
-; CHECK-NEXT:    global_load_ubyte v29, v[2:3], off offset:6
-; CHECK-NEXT:    global_load_ubyte v30, v[2:3], off offset:5
-; CHECK-NEXT:    global_load_ubyte v31, v[2:3], off offset:4
-; CHECK-NEXT:    global_load_ubyte v32, v[2:3], off offset:3
-; CHECK-NEXT:    global_load_ubyte v33, v[2:3], off offset:2
-; CHECK-NEXT:    global_load_ubyte v34, v[2:3], off offset:1
-; CHECK-NEXT:    global_load_ubyte v2, v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:31
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(29)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(21)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v22 offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v23 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_byte v[0:1], v24 offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_byte v[0:1], v25 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_byte v[0:1], v26 offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_byte v[0:1], v27 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_byte v[0:1], v28 offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_byte v[0:1], v29 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_byte v[0:1], v30 offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v31 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v32 offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_byte v[0:1], v33 offset:2
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:16
+; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_byte v[0:1], v34 offset:1
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1856,31 +747,9 @@ define void @memmove_p0_p4_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p4_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    global_load_ushort v4, v[2:3], off offset:14
-; CHECK-NEXT:    global_load_ushort v5, v[2:3], off offset:12
-; CHECK-NEXT:    global_load_ushort v6, v[2:3], off offset:10
-; CHECK-NEXT:    global_load_ushort v7, v[2:3], off offset:8
-; CHECK-NEXT:    global_load_ushort v8, v[2:3], off offset:6
-; CHECK-NEXT:    global_load_ushort v9, v[2:3], off offset:4
-; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:2
-; CHECK-NEXT:    global_load_ushort v2, v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:2
+; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1892,55 +761,19 @@ define void @memmove_p0_p4_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p4_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ubyte v4, v[2:3], off offset:30
-; CHECK-NEXT:    global_load_ushort v5, v[2:3], off offset:28
-; CHECK-NEXT:    global_load_ushort v6, v[2:3], off offset:26
-; CHECK-NEXT:    global_load_ushort v7, v[2:3], off offset:24
-; CHECK-NEXT:    global_load_ushort v8, v[2:3], off offset:22
-; CHECK-NEXT:    global_load_ushort v9, v[2:3], off offset:20
-; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:18
-; CHECK-NEXT:    global_load_ushort v11, v[2:3], off offset:16
-; CHECK-NEXT:    global_load_ushort v12, v[2:3], off offset:14
-; CHECK-NEXT:    global_load_ushort v13, v[2:3], off offset:12
-; CHECK-NEXT:    global_load_ushort v14, v[2:3], off offset:10
-; CHECK-NEXT:    global_load_ushort v15, v[2:3], off offset:8
-; CHECK-NEXT:    global_load_ushort v16, v[2:3], off offset:6
-; CHECK-NEXT:    global_load_ushort v17, v[2:3], off offset:4
-; CHECK-NEXT:    global_load_ushort v18, v[2:3], off offset:2
-; CHECK-NEXT:    global_load_ushort v2, v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v12 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_short v[0:1], v13 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_short v[0:1], v14 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_short v[0:1], v15 offset:8
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:30
+; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:28
+; CHECK-NEXT:    global_load_dwordx3 v[6:8], v[2:3], off offset:16
+; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_short v[0:1], v16 offset:6
+; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_short v[0:1], v17 offset:4
+; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_short v[0:1], v18 offset:2
+; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[6:8] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1952,55 +785,13 @@ define void @memmove_p0_p4_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p4_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ushort v4, v[2:3], off offset:30
-; CHECK-NEXT:    global_load_ushort v5, v[2:3], off offset:28
-; CHECK-NEXT:    global_load_ushort v6, v[2:3], off offset:26
-; CHECK-NEXT:    global_load_ushort v7, v[2:3], off offset:24
-; CHECK-NEXT:    global_load_ushort v8, v[2:3], off offset:22
-; CHECK-NEXT:    global_load_ushort v9, v[2:3], off offset:20
-; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:18
-; CHECK-NEXT:    global_load_ushort v11, v[2:3], off offset:16
-; CHECK-NEXT:    global_load_ushort v12, v[2:3], off offset:14
-; CHECK-NEXT:    global_load_ushort v13, v[2:3], off offset:12
-; CHECK-NEXT:    global_load_ushort v14, v[2:3], off offset:10
-; CHECK-NEXT:    global_load_ushort v15, v[2:3], off offset:8
-; CHECK-NEXT:    global_load_ushort v16, v[2:3], off offset:6
-; CHECK-NEXT:    global_load_ushort v17, v[2:3], off offset:4
-; CHECK-NEXT:    global_load_ushort v18, v[2:3], off offset:2
-; CHECK-NEXT:    global_load_ushort v2, v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v12 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_short v[0:1], v13 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_short v[0:1], v14 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_short v[0:1], v15 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_short v[0:1], v16 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_short v[0:1], v17 offset:4
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:16
+; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_short v[0:1], v18 offset:2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2129,55 +920,13 @@ define void @memmove_p0_p5_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p5_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v2, v2, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:1
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2189,100 +938,23 @@ define void @memmove_p0_p5_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p5_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1e
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v2, v2, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(29)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(21)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_byte v[0:1], v22 offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_byte v[0:1], v23 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_byte v[0:1], v24 offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_byte v[0:1], v25 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_byte v[0:1], v26 offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_byte v[0:1], v27 offset:6
+; CHECK-NEXT:    s_clause 0x8
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_byte v[0:1], v28 offset:5
+; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v29 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v30 offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_byte v[0:1], v31 offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_byte v[0:1], v32 offset:1
+; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:30
+; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[7:9] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2294,103 +966,19 @@ define void @memmove_p0_p5_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p5_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1f
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:31
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v2, v2, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(31)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:31
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(29)
-; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(21)
-; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_byte v[0:1], v22 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_byte v[0:1], v23 offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_byte v[0:1], v24 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_byte v[0:1], v25 offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_byte v[0:1], v26 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_byte v[0:1], v27 offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_byte v[0:1], v28 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_byte v[0:1], v29 offset:5
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_byte v[0:1], v30 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v31 offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_byte v[0:1], v32 offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_byte v[0:1], v33 offset:1
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[7:10]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2402,31 +990,13 @@ define void @memmove_p0_p5_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p5_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    buffer_load_ushort v3, v2, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ushort v4, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v5, v2, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ushort v6, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ushort v7, v2, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ushort v8, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ushort v9, v2, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ushort v2, v2, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v3 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:2
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2438,55 +1008,23 @@ define void @memmove_p0_p5_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p5_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ushort v4, v2, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ushort v5, v2, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ushort v6, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ushort v7, v2, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ushort v8, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ushort v9, v2, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ushort v10, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ushort v11, v2, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ushort v12, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v13, v2, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ushort v14, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ushort v15, v2, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ushort v16, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ushort v17, v2, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ushort v2, v2, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_short v[0:1], v12 offset:12
+; CHECK-NEXT:    s_clause 0x8
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_short v[0:1], v13 offset:10
+; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_short v[0:1], v14 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_short v[0:1], v15 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_short v[0:1], v16 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_short v[0:1], v17 offset:2
+; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:30
+; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[7:9] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2498,55 +1036,19 @@ define void @memmove_p0_p5_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p5_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    buffer_load_ushort v3, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ushort v4, v2, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ushort v5, v2, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ushort v6, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ushort v7, v2, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ushort v8, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ushort v9, v2, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ushort v10, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ushort v11, v2, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ushort v12, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v13, v2, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ushort v14, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ushort v15, v2, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ushort v16, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ushort v17, v2, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ushort v2, v2, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    flat_store_short v[0:1], v3 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    flat_store_short v[0:1], v5 offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    flat_store_short v[0:1], v6 offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    flat_store_short v[0:1], v7 offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    flat_store_short v[0:1], v8 offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    flat_store_short v[0:1], v9 offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    flat_store_short v[0:1], v12 offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    flat_store_short v[0:1], v13 offset:10
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    flat_store_short v[0:1], v14 offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_short v[0:1], v15 offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_short v[0:1], v16 offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_short v[0:1], v17 offset:2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v2
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[7:10]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2698,41 +1200,8 @@ define void @memmove_p1_p0_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p1_p0_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:5
-; CHECK-NEXT:    flat_load_ubyte v5, v[2:3] offset:6
-; CHECK-NEXT:    flat_load_ubyte v6, v[2:3] offset:7
-; CHECK-NEXT:    flat_load_ubyte v7, v[2:3]
-; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:1
-; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:2
-; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:4
-; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:3
-; CHECK-NEXT:    flat_load_ubyte v12, v[2:3] offset:15
-; CHECK-NEXT:    flat_load_ubyte v13, v[2:3] offset:14
-; CHECK-NEXT:    flat_load_ubyte v14, v[2:3] offset:13
-; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:12
-; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:9
-; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:8
-; CHECK-NEXT:    flat_load_ubyte v18, v[2:3] offset:11
-; CHECK-NEXT:    flat_load_ubyte v2, v[2:3] offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v4, 8, v10
-; CHECK-NEXT:    v_lshl_or_b32 v4, v6, 8, v5
-; CHECK-NEXT:    v_lshl_or_b32 v5, v8, 8, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v11, 8, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v12, 8, v13
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v14, 8, v15
-; CHECK-NEXT:    v_lshl_or_b32 v3, v4, 16, v3
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v16, 8, v17
+; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v18, 8, v2
-; CHECK-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
-; CHECK-NEXT:    v_lshl_or_b32 v5, v8, 16, v7
-; CHECK-NEXT:    v_lshl_or_b32 v4, v10, 16, v9
 ; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2744,79 +1213,18 @@ define void @memmove_p1_p0_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p1_p0_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1e
-; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:15
-; CHECK-NEXT:    flat_load_ubyte v5, v[2:3] offset:14
-; CHECK-NEXT:    flat_load_ubyte v6, v[2:3] offset:13
-; CHECK-NEXT:    flat_load_ubyte v7, v[2:3] offset:12
-; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:23
-; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:22
-; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:21
-; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:20
-; CHECK-NEXT:    flat_load_ubyte v12, v[2:3] offset:29
-; CHECK-NEXT:    flat_load_ubyte v13, v[2:3] offset:30
-; CHECK-NEXT:    flat_load_ubyte v14, v[2:3] offset:27
-; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:26
-; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:25
-; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:24
-; CHECK-NEXT:    flat_load_ubyte v18, v[2:3] offset:19
-; CHECK-NEXT:    flat_load_ubyte v19, v[2:3] offset:18
-; CHECK-NEXT:    flat_load_ubyte v20, v[2:3] offset:17
-; CHECK-NEXT:    flat_load_ubyte v21, v[2:3] offset:16
-; CHECK-NEXT:    flat_load_ubyte v22, v[2:3] offset:11
-; CHECK-NEXT:    flat_load_ubyte v23, v[2:3] offset:10
-; CHECK-NEXT:    flat_load_ubyte v24, v[2:3] offset:9
-; CHECK-NEXT:    flat_load_ubyte v25, v[2:3] offset:8
-; CHECK-NEXT:    flat_load_ubyte v26, v[2:3] offset:7
-; CHECK-NEXT:    flat_load_ubyte v27, v[2:3] offset:6
-; CHECK-NEXT:    flat_load_ubyte v28, v[2:3] offset:5
-; CHECK-NEXT:    flat_load_ubyte v29, v[2:3] offset:4
-; CHECK-NEXT:    flat_load_ubyte v30, v[2:3] offset:1
-; CHECK-NEXT:    flat_load_ubyte v31, v[2:3]
-; CHECK-NEXT:    flat_load_ubyte v32, v[2:3] offset:3
-; CHECK-NEXT:    flat_load_ubyte v33, v[2:3] offset:2
-; CHECK-NEXT:    flat_load_ubyte v2, v[2:3] offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(29) lgkmcnt(29)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v4, 8, v5
-; CHECK-NEXT:    s_waitcnt vmcnt(25) lgkmcnt(25)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v8, 8, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(23) lgkmcnt(23)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v10, 8, v11
-; CHECK-NEXT:    s_waitcnt vmcnt(22) lgkmcnt(22)
-; CHECK-NEXT:    v_lshlrev_b16 v12, 8, v12
-; CHECK-NEXT:    s_waitcnt vmcnt(19) lgkmcnt(19)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v14, 8, v15
-; CHECK-NEXT:    s_waitcnt vmcnt(17) lgkmcnt(17)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v16, 8, v17
-; CHECK-NEXT:    v_lshl_or_b32 v16, v6, 8, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT:    v_lshl_or_b32 v15, v18, 8, v19
-; CHECK-NEXT:    v_lshl_or_b32 v7, v9, 16, v10
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(13)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v20, 8, v21
-; CHECK-NEXT:    v_lshl_or_b32 v8, v14, 16, v3
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(11)
-; CHECK-NEXT:    v_lshl_or_b32 v18, v22, 8, v23
-; CHECK-NEXT:    v_lshl_or_b32 v5, v4, 16, v16
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
-; CHECK-NEXT:    v_lshl_or_b32 v17, v24, 8, v25
-; CHECK-NEXT:    v_lshl_or_b32 v6, v15, 16, v11
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
-; CHECK-NEXT:    v_lshl_or_b32 v20, v26, 8, v27
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
-; CHECK-NEXT:    v_lshl_or_b32 v19, v28, 8, v29
-; CHECK-NEXT:    v_lshl_or_b32 v4, v18, 16, v17
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:30
+; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:28
+; CHECK-NEXT:    flat_load_dwordx3 v[6:8], v[2:3] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
 ; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    v_lshl_or_b32 v21, v30, 8, v31
+; CHECK-NEXT:    global_store_byte v[0:1], v9, off offset:30
+; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT:    global_store_short v[0:1], v10, off offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    v_lshl_or_b32 v22, v32, 8, v33
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_or_b32_e32 v12, v12, v2
-; CHECK-NEXT:    v_lshl_or_b32 v3, v20, 16, v19
-; CHECK-NEXT:    v_lshl_or_b32 v2, v22, 16, v21
-; CHECK-NEXT:    global_store_byte v[0:1], v13, off offset:30
-; CHECK-NEXT:    global_store_short v[0:1], v12, off offset:28
 ; CHECK-NEXT:    global_store_dwordx3 v[0:1], v[6:8], off offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2828,79 +1236,13 @@ define void @memmove_p1_p0_sz32_align_1_1(ptr addrspace(1) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p1_p0_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1f
-; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:29
-; CHECK-NEXT:    flat_load_ubyte v5, v[2:3] offset:30
-; CHECK-NEXT:    flat_load_ubyte v6, v[2:3] offset:31
-; CHECK-NEXT:    flat_load_ubyte v7, v[2:3] offset:24
-; CHECK-NEXT:    flat_load_ubyte v8, v[2:3] offset:25
-; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:26
-; CHECK-NEXT:    flat_load_ubyte v10, v[2:3] offset:28
-; CHECK-NEXT:    flat_load_ubyte v11, v[2:3] offset:27
-; CHECK-NEXT:    flat_load_ubyte v12, v[2:3] offset:15
-; CHECK-NEXT:    flat_load_ubyte v13, v[2:3] offset:14
-; CHECK-NEXT:    flat_load_ubyte v14, v[2:3] offset:13
-; CHECK-NEXT:    flat_load_ubyte v15, v[2:3] offset:12
-; CHECK-NEXT:    flat_load_ubyte v16, v[2:3] offset:23
-; CHECK-NEXT:    flat_load_ubyte v17, v[2:3] offset:22
-; CHECK-NEXT:    flat_load_ubyte v18, v[2:3] offset:21
-; CHECK-NEXT:    flat_load_ubyte v19, v[2:3] offset:20
-; CHECK-NEXT:    flat_load_ubyte v20, v[2:3] offset:19
-; CHECK-NEXT:    flat_load_ubyte v21, v[2:3] offset:18
-; CHECK-NEXT:    flat_load_ubyte v22, v[2:3] offset:17
-; CHECK-NEXT:    flat_load_ubyte v23, v[2:3] offset:16
-; CHECK-NEXT:    flat_load_ubyte v24, v[2:3] offset:11
-; CHECK-NEXT:    flat_load_ubyte v25, v[2:3] offset:10
-; CHECK-NEXT:    flat_load_ubyte v26, v[2:3] offset:9
-; CHECK-NEXT:    flat_load_ubyte v27, v[2:3] offset:8
-; CHECK-NEXT:    flat_load_ubyte v28, v[2:3] offset:7
-; CHECK-NEXT:    flat_load_ubyte v29, v[2:3] offset:6
-; CHECK-NEXT:    flat_load_ubyte v30, v[2:3] offset:5
-; CHECK-NEXT:    flat_load_ubyte v31, v[2:3] offset:4
-; CHECK-NEXT:    flat_load_ubyte v32, v[2:3] offset:1
-; CHECK-NEXT:    flat_load_ubyte v33, v[2:3]
-; CHECK-NEXT:    flat_load_ubyte v34, v[2:3] offset:3
-; CHECK-NEXT:    flat_load_ubyte v2, v[2:3] offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(25) lgkmcnt(25)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v4, 8, v10
-; CHECK-NEXT:    v_lshl_or_b32 v4, v6, 8, v5
-; CHECK-NEXT:    v_lshl_or_b32 v6, v8, 8, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(24) lgkmcnt(24)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v11, 8, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(22) lgkmcnt(22)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v12, 8, v13
-; CHECK-NEXT:    s_waitcnt vmcnt(20) lgkmcnt(20)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v14, 8, v15
-; CHECK-NEXT:    v_lshl_or_b32 v5, v4, 16, v3
-; CHECK-NEXT:    s_waitcnt vmcnt(18) lgkmcnt(18)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v16, 8, v17
-; CHECK-NEXT:    v_lshl_or_b32 v4, v7, 16, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(16) lgkmcnt(16)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v18, 8, v19
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v20, 8, v21
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v22, 8, v23
-; CHECK-NEXT:    v_lshl_or_b32 v3, v9, 16, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v15, v24, 8, v25
-; CHECK-NEXT:    v_lshl_or_b32 v9, v12, 16, v14
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v26, 8, v27
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v17, v28, 8, v29
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v16, v30, 8, v31
-; CHECK-NEXT:    v_lshl_or_b32 v8, v15, 16, v13
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v18, v32, 8, v33
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[2:3] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[8:11], v[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v19, v34, 8, v2
-; CHECK-NEXT:    v_lshl_or_b32 v2, v11, 16, v10
-; CHECK-NEXT:    v_lshl_or_b32 v7, v17, 16, v16
-; CHECK-NEXT:    v_lshl_or_b32 v6, v19, 16, v18
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:16
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[6:9], off
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[8:11], off
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 32, i1 false)
@@ -2911,23 +1253,8 @@ define void @memmove_p1_p0_sz16_align_2_2(ptr addrspace(1) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p1_p0_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    flat_load_ushort v4, v[2:3] offset:6
-; CHECK-NEXT:    flat_load_ushort v5, v[2:3]
-; CHECK-NEXT:    flat_load_ushort v6, v[2:3] offset:12
-; CHECK-NEXT:    flat_load_ushort v7, v[2:3] offset:8
-; CHECK-NEXT:    flat_load_ushort v8, v[2:3] offset:4
-; CHECK-NEXT:    flat_load_ushort v9, v[2:3] offset:2
-; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:14
-; CHECK-NEXT:    flat_load_ushort v11, v[2:3] offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v4, 16, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v9, 16, v5
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v10, 16, v6
+; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v11, 16, v7
 ; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2939,41 +1266,18 @@ define void @memmove_p1_p0_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p1_p0_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ushort v4, v[2:3] offset:14
-; CHECK-NEXT:    flat_load_ushort v5, v[2:3] offset:12
-; CHECK-NEXT:    flat_load_ushort v9, v[2:3] offset:10
-; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:8
-; CHECK-NEXT:    flat_load_ushort v11, v[2:3] offset:28
-; CHECK-NEXT:    flat_load_ushort v6, v[2:3] offset:26
-; CHECK-NEXT:    flat_load_ushort v7, v[2:3] offset:24
-; CHECK-NEXT:    flat_load_ushort v12, v[2:3] offset:22
-; CHECK-NEXT:    flat_load_ushort v13, v[2:3] offset:20
-; CHECK-NEXT:    flat_load_ushort v14, v[2:3] offset:18
-; CHECK-NEXT:    flat_load_ushort v15, v[2:3] offset:16
-; CHECK-NEXT:    flat_load_ushort v16, v[2:3] offset:6
-; CHECK-NEXT:    flat_load_ushort v17, v[2:3] offset:4
-; CHECK-NEXT:    flat_load_ushort v18, v[2:3] offset:2
-; CHECK-NEXT:    flat_load_ushort v19, v[2:3]
-; CHECK-NEXT:    flat_load_ubyte v20, v[2:3] offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v4, 16, v5
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v9, 16, v10
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v6, 16, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v12, 16, v13
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v14, 16, v15
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:30
+; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:28
+; CHECK-NEXT:    flat_load_dwordx3 v[6:8], v[2:3] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
 ; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v16, 16, v17
+; CHECK-NEXT:    global_store_byte v[0:1], v9, off offset:30
+; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT:    global_store_short v[0:1], v10, off offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v18, 16, v19
-; CHECK-NEXT:    global_store_short v[0:1], v11, off offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    global_store_byte v[0:1], v20, off offset:30
 ; CHECK-NEXT:    global_store_dwordx3 v[0:1], v[6:8], off offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -2985,39 +1289,13 @@ define void @memmove_p1_p0_sz32_align_2_2(ptr addrspace(1) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p1_p0_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ushort v4, v[2:3] offset:30
-; CHECK-NEXT:    flat_load_ushort v5, v[2:3] offset:28
-; CHECK-NEXT:    flat_load_ushort v6, v[2:3] offset:26
-; CHECK-NEXT:    flat_load_ushort v7, v[2:3] offset:14
-; CHECK-NEXT:    flat_load_ushort v8, v[2:3] offset:12
-; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:10
-; CHECK-NEXT:    flat_load_ushort v11, v[2:3] offset:8
-; CHECK-NEXT:    flat_load_ushort v9, v[2:3] offset:24
-; CHECK-NEXT:    flat_load_ushort v12, v[2:3] offset:22
-; CHECK-NEXT:    flat_load_ushort v13, v[2:3] offset:20
-; CHECK-NEXT:    flat_load_ushort v14, v[2:3] offset:18
-; CHECK-NEXT:    flat_load_ushort v15, v[2:3] offset:16
-; CHECK-NEXT:    flat_load_ushort v16, v[2:3] offset:6
-; CHECK-NEXT:    flat_load_ushort v17, v[2:3] offset:4
-; CHECK-NEXT:    flat_load_ushort v18, v[2:3] offset:2
-; CHECK-NEXT:    flat_load_ushort v19, v[2:3]
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v4, 16, v5
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v6, 16, v9
-; CHECK-NEXT:    v_lshl_or_b32 v9, v7, 16, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v12, 16, v13
-; CHECK-NEXT:    v_lshl_or_b32 v8, v10, 16, v11
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v14, 16, v15
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v16, 16, v17
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[2:3] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[8:11], v[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v18, 16, v19
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:16
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[6:9], off
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[8:11], off
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 32, i1 false)
@@ -3783,44 +2061,13 @@ define void @memmove_p1_p5_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p1_p5_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v4, 8, v3
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v6, 8, v5
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v8, 8, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v10, 8, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v12, 8, v11
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v14, 8, v13
-; CHECK-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v16, 8, v15
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v2, 8, v17
-; CHECK-NEXT:    v_lshl_or_b32 v2, v5, 16, v4
-; CHECK-NEXT:    v_lshl_or_b32 v5, v9, 16, v6
-; CHECK-NEXT:    v_lshl_or_b32 v4, v11, 16, v10
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 16, i1 false)
@@ -3831,82 +2078,24 @@ define void @memmove_p1_p5_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p1_p5_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1e
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(29)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v4, 8, v3
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v6, 8, v5
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    v_lshlrev_b16 v4, 8, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v9, 8, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v11, 8, v10
-; CHECK-NEXT:    v_lshl_or_b32 v8, v3, 16, v2
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v13, 8, v12
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v15, 8, v14
-; CHECK-NEXT:    v_lshl_or_b32 v2, v9, 16, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v17, 8, v16
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v19, 8, v18
-; CHECK-NEXT:    v_lshl_or_b32 v3, v6, 16, v5
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v21, 8, v20
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v23, 8, v22
+; CHECK-NEXT:    s_clause 0x8
+; CHECK-NEXT:    buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    s_waitcnt vmcnt(8)
+; CHECK-NEXT:    global_store_short v[0:1], v10, off offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    v_lshl_or_b32 v15, v26, 8, v25
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v24, 8, v28
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v16, v29, 8, v27
-; CHECK-NEXT:    v_lshl_or_b32 v5, v11, 16, v10
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v17, v31, 8, v30
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    v_or_b32_e32 v18, v4, v32
-; CHECK-NEXT:    v_lshl_or_b32 v4, v13, 16, v12
-; CHECK-NEXT:    v_lshl_or_b32 v7, v15, 16, v14
-; CHECK-NEXT:    v_lshl_or_b32 v6, v16, 16, v17
+; CHECK-NEXT:    global_store_byte v[0:1], v11, off offset:30
+; CHECK-NEXT:    s_waitcnt vmcnt(3)
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_byte v[0:1], v33, off offset:30
-; CHECK-NEXT:    global_store_short v[0:1], v18, off offset:28
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
-; CHECK-NEXT:    global_store_dwordx3 v[0:1], v[6:8], off offset:16
+; CHECK-NEXT:    global_store_dwordx3 v[0:1], v[7:9], off offset:16
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -3917,81 +2106,19 @@ define void @memmove_p1_p5_sz32_align_1_1(ptr addrspace(1) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p1_p5_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1f
-; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:31
-; CHECK-NEXT:    buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v4, 8, v3
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v6, 8, v5
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v8, 8, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v10, 8, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v12, 8, v11
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v14, 8, v13
-; CHECK-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v16, 8, v15
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v18, 8, v17
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v21, 8, v20
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    v_lshl_or_b32 v15, v23, 8, v22
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v19, 8, v25
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v26, 8, v24
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v16, v28, 8, v27
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v17, v30, 8, v29
-; CHECK-NEXT:    v_lshl_or_b32 v7, v13, 16, v12
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v18, v32, 8, v31
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v19, v2, 8, v33
-; CHECK-NEXT:    v_lshl_or_b32 v2, v5, 16, v4
-; CHECK-NEXT:    v_lshl_or_b32 v5, v9, 16, v6
-; CHECK-NEXT:    v_lshl_or_b32 v4, v11, 16, v10
-; CHECK-NEXT:    v_lshl_or_b32 v6, v14, 16, v15
-; CHECK-NEXT:    v_lshl_or_b32 v9, v17, 16, v16
-; CHECK-NEXT:    v_lshl_or_b32 v8, v19, 16, v18
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[6:9], off offset:16
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:16
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 32, i1 false)
@@ -4002,24 +2129,13 @@ define void @memmove_p1_p5_sz16_align_2_2(ptr addrspace(1) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p1_p5_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    buffer_load_ushort v4, v2, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ushort v5, v2, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ushort v3, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ushort v6, v2, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ushort v7, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v8, v2, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ushort v9, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ushort v10, v2, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v5, 16, v4
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v6, 16, v3
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v8, 16, v7
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v10, 16, v9
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 16, i1 false)
@@ -4030,43 +2146,24 @@ define void @memmove_p1_p5_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p1_p5_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    buffer_load_ushort v3, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ushort v4, v2, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ushort v5, v2, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ushort v6, v2, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ushort v7, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ushort v9, v2, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ushort v10, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ushort v11, v2, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ushort v12, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v13, v2, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ushort v14, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ushort v15, v2, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ushort v16, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ushort v17, v2, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ushort v18, v2, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v4, 16, v3
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v9, 16, v7
+; CHECK-NEXT:    s_clause 0x8
+; CHECK-NEXT:    buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v11, 16, v10
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v13, 16, v12
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v15, 16, v14
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v17, 16, v16
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    global_store_short v[0:1], v18, off offset:28
+; CHECK-NEXT:    global_store_short v[0:1], v10, off offset:28
+; CHECK-NEXT:    s_waitcnt vmcnt(7)
+; CHECK-NEXT:    global_store_byte v[0:1], v11, off offset:30
+; CHECK-NEXT:    s_waitcnt vmcnt(3)
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_byte v[0:1], v19, off offset:30
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
-; CHECK-NEXT:    global_store_dwordx3 v[0:1], v[6:8], off offset:16
+; CHECK-NEXT:    global_store_dwordx3 v[0:1], v[7:9], off offset:16
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -4077,41 +2174,19 @@ define void @memmove_p1_p5_sz32_align_2_2(ptr addrspace(1) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p1_p5_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    buffer_load_ushort v4, v2, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ushort v5, v2, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ushort v3, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ushort v6, v2, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ushort v7, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ushort v8, v2, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ushort v9, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v10, v2, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ushort v11, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ushort v12, v2, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ushort v13, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ushort v14, v2, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ushort v15, v2, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ushort v16, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ushort v17, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ushort v18, v2, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v5, 16, v4
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v6, 16, v3
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v8, 16, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v10, 16, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v12, 16, v11
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v14, 16, v13
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v16, 16, v15
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v18, 16, v17
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[6:9], off offset:16
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[7:10], off offset:16
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 32, i1 false)
@@ -4258,41 +2333,8 @@ define void @memmove_p3_p0_sz16_align_1_1(ptr addrspace(3) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p3_p0_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ubyte v3, v[1:2] offset:5
-; CHECK-NEXT:    flat_load_ubyte v4, v[1:2] offset:6
-; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:7
-; CHECK-NEXT:    flat_load_ubyte v6, v[1:2]
-; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:1
-; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:2
-; CHECK-NEXT:    flat_load_ubyte v9, v[1:2] offset:4
-; CHECK-NEXT:    flat_load_ubyte v10, v[1:2] offset:3
-; CHECK-NEXT:    flat_load_ubyte v11, v[1:2] offset:15
-; CHECK-NEXT:    flat_load_ubyte v12, v[1:2] offset:14
-; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:13
-; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:12
-; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:9
-; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:8
-; CHECK-NEXT:    flat_load_ubyte v17, v[1:2] offset:11
-; CHECK-NEXT:    flat_load_ubyte v1, v[1:2] offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v3, 8, v9
-; CHECK-NEXT:    v_lshl_or_b32 v3, v5, 8, v4
-; CHECK-NEXT:    v_lshl_or_b32 v4, v7, 8, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v10, 8, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v11, 8, v12
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v13, 8, v14
-; CHECK-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v15, 8, v16
+; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v17, 8, v1
-; CHECK-NEXT:    v_lshl_or_b32 v1, v5, 16, v4
-; CHECK-NEXT:    v_lshl_or_b32 v4, v7, 16, v6
-; CHECK-NEXT:    v_lshl_or_b32 v3, v9, 16, v8
 ; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -4305,82 +2347,20 @@ define void @memmove_p3_p0_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p3_p0_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1e
-; CHECK-NEXT:    flat_load_ubyte v3, v[1:2] offset:15
-; CHECK-NEXT:    flat_load_ubyte v4, v[1:2] offset:14
-; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:13
-; CHECK-NEXT:    flat_load_ubyte v6, v[1:2] offset:12
-; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:23
-; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:22
-; CHECK-NEXT:    flat_load_ubyte v9, v[1:2] offset:21
-; CHECK-NEXT:    flat_load_ubyte v10, v[1:2] offset:20
-; CHECK-NEXT:    flat_load_ubyte v11, v[1:2] offset:29
-; CHECK-NEXT:    flat_load_ubyte v12, v[1:2] offset:30
-; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:27
-; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:26
-; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:25
-; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:24
-; CHECK-NEXT:    flat_load_ubyte v17, v[1:2] offset:19
-; CHECK-NEXT:    flat_load_ubyte v18, v[1:2] offset:18
-; CHECK-NEXT:    flat_load_ubyte v19, v[1:2] offset:17
-; CHECK-NEXT:    flat_load_ubyte v20, v[1:2] offset:16
-; CHECK-NEXT:    flat_load_ubyte v21, v[1:2] offset:11
-; CHECK-NEXT:    flat_load_ubyte v22, v[1:2] offset:10
-; CHECK-NEXT:    flat_load_ubyte v23, v[1:2] offset:9
-; CHECK-NEXT:    flat_load_ubyte v24, v[1:2] offset:8
-; CHECK-NEXT:    flat_load_ubyte v25, v[1:2] offset:7
-; CHECK-NEXT:    flat_load_ubyte v26, v[1:2] offset:6
-; CHECK-NEXT:    flat_load_ubyte v27, v[1:2] offset:5
-; CHECK-NEXT:    flat_load_ubyte v28, v[1:2] offset:4
-; CHECK-NEXT:    flat_load_ubyte v29, v[1:2] offset:1
-; CHECK-NEXT:    flat_load_ubyte v30, v[1:2]
-; CHECK-NEXT:    flat_load_ubyte v31, v[1:2] offset:3
-; CHECK-NEXT:    flat_load_ubyte v32, v[1:2] offset:2
-; CHECK-NEXT:    flat_load_ubyte v1, v[1:2] offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(29) lgkmcnt(29)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v3, 8, v4
-; CHECK-NEXT:    s_waitcnt vmcnt(27) lgkmcnt(27)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v5, 8, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(25) lgkmcnt(25)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v7, 8, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(23) lgkmcnt(23)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v9, 8, v10
-; CHECK-NEXT:    s_waitcnt vmcnt(22) lgkmcnt(22)
-; CHECK-NEXT:    v_lshlrev_b16 v11, 8, v11
-; CHECK-NEXT:    v_lshl_or_b32 v4, v3, 16, v5
-; CHECK-NEXT:    s_waitcnt vmcnt(19) lgkmcnt(19)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v13, 8, v14
-; CHECK-NEXT:    s_waitcnt vmcnt(17) lgkmcnt(17)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v15, 8, v16
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v17, 8, v18
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(13)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v19, 8, v20
-; CHECK-NEXT:    v_lshl_or_b32 v13, v13, 16, v2
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(11)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v21, 8, v22
-; CHECK-NEXT:    v_lshl_or_b32 v2, v7, 16, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v23, 8, v24
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
-; CHECK-NEXT:    v_lshl_or_b32 v16, v25, 8, v26
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
-; CHECK-NEXT:    v_lshl_or_b32 v15, v27, 8, v28
-; CHECK-NEXT:    v_lshl_or_b32 v3, v14, 16, v6
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:30
+; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:28
+; CHECK-NEXT:    flat_load_dwordx3 v[5:7], v[1:2] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
 ; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    v_lshl_or_b32 v17, v29, 8, v30
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    v_lshl_or_b32 v18, v31, 8, v32
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_or_b32_e32 v11, v11, v1
-; CHECK-NEXT:    v_lshl_or_b32 v1, v10, 16, v8
-; CHECK-NEXT:    v_lshl_or_b32 v6, v16, 16, v15
-; CHECK-NEXT:    v_lshl_or_b32 v5, v18, 16, v17
-; CHECK-NEXT:    ds_write_b8 v0, v12 offset:30
-; CHECK-NEXT:    ds_write_b32 v0, v13 offset:24
-; CHECK-NEXT:    ds_write_b16 v0, v11 offset:28
-; CHECK-NEXT:    ds_write_b64 v0, v[1:2] offset:16
-; CHECK-NEXT:    ds_write2_b64 v0, v[5:6], v[3:4] offset1:1
+; CHECK-NEXT:    ds_write_b8 v0, v8 offset:30
+; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(3)
+; CHECK-NEXT:    ds_write_b16 v0, v9 offset:28
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(3)
+; CHECK-NEXT:    ds_write_b32 v0, v7 offset:24
+; CHECK-NEXT:    ds_write_b64 v0, v[5:6] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(4)
+; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -4392,79 +2372,13 @@ define void @memmove_p3_p0_sz32_align_1_1(ptr addrspace(3) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p3_p0_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1f
-; CHECK-NEXT:    flat_load_ubyte v3, v[1:2] offset:29
-; CHECK-NEXT:    flat_load_ubyte v4, v[1:2] offset:30
-; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:31
-; CHECK-NEXT:    flat_load_ubyte v6, v[1:2] offset:24
-; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:25
-; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:26
-; CHECK-NEXT:    flat_load_ubyte v9, v[1:2] offset:28
-; CHECK-NEXT:    flat_load_ubyte v10, v[1:2] offset:27
-; CHECK-NEXT:    flat_load_ubyte v11, v[1:2] offset:15
-; CHECK-NEXT:    flat_load_ubyte v12, v[1:2] offset:14
-; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:13
-; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:12
-; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:23
-; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:22
-; CHECK-NEXT:    flat_load_ubyte v17, v[1:2] offset:21
-; CHECK-NEXT:    flat_load_ubyte v18, v[1:2] offset:20
-; CHECK-NEXT:    flat_load_ubyte v19, v[1:2] offset:19
-; CHECK-NEXT:    flat_load_ubyte v20, v[1:2] offset:18
-; CHECK-NEXT:    flat_load_ubyte v21, v[1:2] offset:17
-; CHECK-NEXT:    flat_load_ubyte v22, v[1:2] offset:16
-; CHECK-NEXT:    flat_load_ubyte v23, v[1:2] offset:11
-; CHECK-NEXT:    flat_load_ubyte v24, v[1:2] offset:10
-; CHECK-NEXT:    flat_load_ubyte v25, v[1:2] offset:9
-; CHECK-NEXT:    flat_load_ubyte v26, v[1:2] offset:8
-; CHECK-NEXT:    flat_load_ubyte v27, v[1:2] offset:7
-; CHECK-NEXT:    flat_load_ubyte v28, v[1:2] offset:6
-; CHECK-NEXT:    flat_load_ubyte v29, v[1:2] offset:5
-; CHECK-NEXT:    flat_load_ubyte v30, v[1:2] offset:4
-; CHECK-NEXT:    flat_load_ubyte v31, v[1:2] offset:1
-; CHECK-NEXT:    flat_load_ubyte v32, v[1:2]
-; CHECK-NEXT:    flat_load_ubyte v33, v[1:2] offset:3
-; CHECK-NEXT:    flat_load_ubyte v1, v[1:2] offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(25) lgkmcnt(25)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v3, 8, v9
-; CHECK-NEXT:    v_lshl_or_b32 v3, v5, 8, v4
-; CHECK-NEXT:    v_lshl_or_b32 v4, v7, 8, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(24) lgkmcnt(24)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v10, 8, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(22) lgkmcnt(22)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v11, 8, v12
-; CHECK-NEXT:    s_waitcnt vmcnt(20) lgkmcnt(20)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v13, 8, v14
-; CHECK-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
-; CHECK-NEXT:    s_waitcnt vmcnt(18) lgkmcnt(18)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v15, 8, v16
-; CHECK-NEXT:    s_waitcnt vmcnt(16) lgkmcnt(16)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v17, 8, v18
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v19, 8, v20
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v21, 8, v22
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v23, 8, v24
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v25, 8, v26
-; CHECK-NEXT:    v_lshl_or_b32 v3, v9, 16, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v15, v27, 8, v28
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v29, 8, v30
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v16, v31, 8, v32
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v17, v33, 8, v1
-; CHECK-NEXT:    v_lshl_or_b32 v1, v5, 16, v4
-; CHECK-NEXT:    v_lshl_or_b32 v4, v7, 16, v6
-; CHECK-NEXT:    v_lshl_or_b32 v6, v11, 16, v10
-; CHECK-NEXT:    v_lshl_or_b32 v5, v13, 16, v12
-; CHECK-NEXT:    v_lshl_or_b32 v8, v15, 16, v14
-; CHECK-NEXT:    v_lshl_or_b32 v7, v17, 16, v16
-; CHECK-NEXT:    ds_write2_b64 v0, v[3:4], v[1:2] offset0:2 offset1:3
-; CHECK-NEXT:    ds_write2_b64 v0, v[7:8], v[5:6] offset1:1
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[3:6], v[1:2] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[7:10], v[1:2]
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT:    ds_write2_b64 v0, v[3:4], v[5:6] offset0:2 offset1:3
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(1)
+; CHECK-NEXT:    ds_write2_b64 v0, v[7:8], v[9:10] offset1:1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -4476,23 +2390,8 @@ define void @memmove_p3_p0_sz16_align_2_2(ptr addrspace(3) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p3_p0_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    flat_load_ushort v3, v[1:2] offset:6
-; CHECK-NEXT:    flat_load_ushort v4, v[1:2]
-; CHECK-NEXT:    flat_load_ushort v5, v[1:2] offset:12
-; CHECK-NEXT:    flat_load_ushort v6, v[1:2] offset:8
-; CHECK-NEXT:    flat_load_ushort v7, v[1:2] offset:4
-; CHECK-NEXT:    flat_load_ushort v8, v[1:2] offset:2
-; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:14
-; CHECK-NEXT:    flat_load_ushort v10, v[1:2] offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v3, 16, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v1, v8, 16, v4
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v9, 16, v5
+; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v10, 16, v6
 ; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -4505,43 +2404,20 @@ define void @memmove_p3_p0_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p3_p0_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ushort v3, v[1:2] offset:14
-; CHECK-NEXT:    flat_load_ushort v4, v[1:2] offset:12
-; CHECK-NEXT:    flat_load_ushort v5, v[1:2] offset:10
-; CHECK-NEXT:    flat_load_ushort v6, v[1:2] offset:8
-; CHECK-NEXT:    flat_load_ushort v7, v[1:2] offset:28
-; CHECK-NEXT:    flat_load_ushort v8, v[1:2] offset:26
-; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:24
-; CHECK-NEXT:    flat_load_ushort v10, v[1:2] offset:22
-; CHECK-NEXT:    flat_load_ushort v11, v[1:2] offset:20
-; CHECK-NEXT:    flat_load_ushort v12, v[1:2] offset:18
-; CHECK-NEXT:    flat_load_ushort v13, v[1:2] offset:16
-; CHECK-NEXT:    flat_load_ushort v14, v[1:2] offset:6
-; CHECK-NEXT:    flat_load_ushort v15, v[1:2] offset:4
-; CHECK-NEXT:    flat_load_ushort v16, v[1:2] offset:2
-; CHECK-NEXT:    flat_load_ushort v17, v[1:2]
-; CHECK-NEXT:    flat_load_ubyte v18, v[1:2] offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v3, 16, v4
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v5, 16, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v8, 16, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v10, 16, v11
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
-; CHECK-NEXT:    v_lshl_or_b32 v1, v12, 16, v13
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:30
+; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:28
+; CHECK-NEXT:    flat_load_dwordx3 v[5:7], v[1:2] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
 ; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v14, 16, v15
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v16, 16, v17
-; CHECK-NEXT:    ds_write_b16 v0, v7 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(1)
-; CHECK-NEXT:    ds_write_b8 v0, v18 offset:30
-; CHECK-NEXT:    ds_write_b32 v0, v8 offset:24
-; CHECK-NEXT:    ds_write_b64 v0, v[1:2] offset:16
-; CHECK-NEXT:    ds_write2_b64 v0, v[5:6], v[3:4] offset1:1
+; CHECK-NEXT:    ds_write_b8 v0, v8 offset:30
+; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(3)
+; CHECK-NEXT:    ds_write_b16 v0, v9 offset:28
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(3)
+; CHECK-NEXT:    ds_write_b32 v0, v7 offset:24
+; CHECK-NEXT:    ds_write_b64 v0, v[5:6] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(4)
+; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -4553,40 +2429,13 @@ define void @memmove_p3_p0_sz32_align_2_2(ptr addrspace(3) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p3_p0_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ushort v3, v[1:2] offset:30
-; CHECK-NEXT:    flat_load_ushort v4, v[1:2] offset:28
-; CHECK-NEXT:    flat_load_ushort v5, v[1:2] offset:26
-; CHECK-NEXT:    flat_load_ushort v6, v[1:2] offset:14
-; CHECK-NEXT:    flat_load_ushort v7, v[1:2] offset:12
-; CHECK-NEXT:    flat_load_ushort v8, v[1:2] offset:10
-; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:8
-; CHECK-NEXT:    flat_load_ushort v10, v[1:2] offset:24
-; CHECK-NEXT:    flat_load_ushort v11, v[1:2] offset:22
-; CHECK-NEXT:    flat_load_ushort v12, v[1:2] offset:20
-; CHECK-NEXT:    flat_load_ushort v13, v[1:2] offset:18
-; CHECK-NEXT:    flat_load_ushort v14, v[1:2] offset:16
-; CHECK-NEXT:    flat_load_ushort v15, v[1:2] offset:6
-; CHECK-NEXT:    flat_load_ushort v16, v[1:2] offset:4
-; CHECK-NEXT:    flat_load_ushort v17, v[1:2] offset:2
-; CHECK-NEXT:    flat_load_ushort v18, v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v3, 16, v4
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(11)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v6, 16, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v1, v5, 16, v10
-; CHECK-NEXT:    v_lshl_or_b32 v5, v8, 16, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v11, 16, v12
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v13, 16, v14
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v15, 16, v16
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v17, 16, v18
-; CHECK-NEXT:    ds_write2_b64 v0, v[3:4], v[1:2] offset0:2 offset1:3
-; CHECK-NEXT:    ds_write2_b64 v0, v[7:8], v[5:6] offset1:1
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[3:6], v[1:2] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[7:10], v[1:2]
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
+; CHECK-NEXT:    ds_write2_b64 v0, v[3:4], v[5:6] offset0:2 offset1:3
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(1)
+; CHECK-NEXT:    ds_write2_b64 v0, v[7:8], v[9:10] offset1:1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -5398,44 +3247,13 @@ define void @memmove_p3_p5_sz16_align_1_1(ptr addrspace(3) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p3_p5_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v3, 8, v2
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v5, 8, v4
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v7, 8, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v9, 8, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v11, 8, v10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v13, 8, v12
-; CHECK-NEXT:    v_lshl_or_b32 v2, v7, 16, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v15, 8, v14
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v1, 8, v16
-; CHECK-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
-; CHECK-NEXT:    v_lshl_or_b32 v4, v8, 16, v5
-; CHECK-NEXT:    v_lshl_or_b32 v3, v10, 16, v9
-; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; CHECK-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -5447,83 +3265,26 @@ define void @memmove_p3_p5_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p3_p5_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1e
-; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v7, v1, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(29)
-; CHECK-NEXT:    v_lshl_or_b32 v1, v3, 8, v2
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v5, 8, v4
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    v_lshlrev_b16 v3, 8, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v8, 8, v7
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v10, 8, v9
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v12, 8, v11
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v14, 8, v13
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v16, 8, v15
-; CHECK-NEXT:    v_lshl_or_b32 v16, v2, 16, v1
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v18, 8, v17
-; CHECK-NEXT:    v_lshl_or_b32 v2, v5, 16, v4
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v20, 8, v19
-; CHECK-NEXT:    v_lshl_or_b32 v1, v7, 16, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v22, 8, v21
+; CHECK-NEXT:    s_clause 0x8
+; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_ushort v9, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    s_waitcnt vmcnt(8)
+; CHECK-NEXT:    ds_write_b32 v0, v8 offset:24
 ; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v25, 8, v24
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v23, 8, v27
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v28, 8, v26
-; CHECK-NEXT:    v_lshl_or_b32 v4, v9, 16, v8
+; CHECK-NEXT:    ds_write_b16 v0, v9 offset:28
+; CHECK-NEXT:    s_waitcnt vmcnt(6)
+; CHECK-NEXT:    ds_write_b8 v0, v10 offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v15, v30, 8, v29
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    v_or_b32_e32 v17, v3, v31
-; CHECK-NEXT:    v_lshl_or_b32 v3, v11, 16, v10
-; CHECK-NEXT:    v_lshl_or_b32 v6, v13, 16, v12
-; CHECK-NEXT:    v_lshl_or_b32 v5, v14, 16, v15
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    ds_write_b8 v0, v32 offset:30
-; CHECK-NEXT:    ds_write_b32 v0, v16 offset:24
-; CHECK-NEXT:    ds_write_b16 v0, v17 offset:28
-; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
-; CHECK-NEXT:    ds_write_b64 v0, v[5:6] offset:16
+; CHECK-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    ds_write_b64 v0, v[6:7] offset:16
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -5535,81 +3296,19 @@ define void @memmove_p3_p5_sz32_align_1_1(ptr addrspace(3) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p3_p5_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1f
-; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:31
-; CHECK-NEXT:    buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v3, 8, v2
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v5, 8, v4
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v7, 8, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v9, 8, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    v_lshl_or_b32 v9, v11, 8, v10
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    v_lshl_or_b32 v10, v13, 8, v12
-; CHECK-NEXT:    v_lshl_or_b32 v2, v7, 16, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v15, 8, v14
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v17, 8, v16
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    v_lshl_or_b32 v12, v20, 8, v19
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    v_lshl_or_b32 v14, v22, 8, v21
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    v_lshl_or_b32 v11, v18, 8, v24
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v13, v25, 8, v23
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v15, v27, 8, v26
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v9, v1, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v16, v29, 8, v28
-; CHECK-NEXT:    v_lshl_or_b32 v6, v12, 16, v11
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v17, v31, 8, v30
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v18, v1, 8, v32
-; CHECK-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
-; CHECK-NEXT:    v_lshl_or_b32 v4, v8, 16, v5
-; CHECK-NEXT:    v_lshl_or_b32 v3, v10, 16, v9
-; CHECK-NEXT:    v_lshl_or_b32 v5, v13, 16, v14
-; CHECK-NEXT:    v_lshl_or_b32 v8, v16, 16, v15
-; CHECK-NEXT:    v_lshl_or_b32 v7, v18, 16, v17
-; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
-; CHECK-NEXT:    ds_write2_b64 v0, v[5:6], v[7:8] offset0:2 offset1:3
+; CHECK-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    ds_write2_b64 v0, v[6:7], v[8:9] offset0:2 offset1:3
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -5621,24 +3320,13 @@ define void @memmove_p3_p5_sz16_align_2_2(ptr addrspace(3) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p3_p5_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    buffer_load_ushort v3, v1, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ushort v4, v1, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ushort v2, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ushort v5, v1, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ushort v6, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v7, v1, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ushort v8, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ushort v9, v1, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v5, 16, v2
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v7, 16, v6
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v9, 16, v8
-; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
+; CHECK-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -5650,44 +3338,26 @@ define void @memmove_p3_p5_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p3_p5_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    buffer_load_ushort v2, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ushort v3, v1, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ushort v4, v1, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ushort v5, v1, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ushort v6, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ushort v7, v1, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ushort v8, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ushort v9, v1, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ushort v10, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v11, v1, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ushort v12, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ushort v13, v1, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ushort v14, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ushort v15, v1, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ushort v16, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v18, v3, 16, v2
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v1, v5, 16, v4
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v7, 16, v6
+; CHECK-NEXT:    s_clause 0x8
+; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_ushort v9, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:20
 ; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v9, 16, v8
+; CHECK-NEXT:    ds_write_b32 v0, v8 offset:24
+; CHECK-NEXT:    s_waitcnt vmcnt(7)
+; CHECK-NEXT:    ds_write_b16 v0, v9 offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v11, 16, v10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v13, 16, v12
+; CHECK-NEXT:    ds_write_b8 v0, v10 offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v15, 16, v14
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    ds_write_b16 v0, v16 offset:28
+; CHECK-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    ds_write_b8 v0, v17 offset:30
-; CHECK-NEXT:    ds_write_b32 v0, v18 offset:24
-; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
-; CHECK-NEXT:    ds_write_b64 v0, v[5:6] offset:16
+; CHECK-NEXT:    ds_write_b64 v0, v[6:7] offset:16
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -5699,41 +3369,19 @@ define void @memmove_p3_p5_sz32_align_2_2(ptr addrspace(3) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p3_p5_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    buffer_load_ushort v3, v1, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_ushort v4, v1, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ushort v2, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ushort v5, v1, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ushort v6, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ushort v7, v1, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ushort v8, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v9, v1, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ushort v10, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ushort v11, v1, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ushort v12, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ushort v13, v1, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ushort v14, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ushort v15, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ushort v16, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ushort v17, v1, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    v_lshl_or_b32 v2, v5, 16, v2
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    v_lshl_or_b32 v3, v7, 16, v6
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    v_lshl_or_b32 v4, v9, 16, v8
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    v_lshl_or_b32 v5, v11, 16, v10
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v9, v1, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    v_lshl_or_b32 v6, v13, 16, v12
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    v_lshl_or_b32 v8, v15, 16, v14
+; CHECK-NEXT:    ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_lshl_or_b32 v7, v17, 16, v16
-; CHECK-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
-; CHECK-NEXT:    ds_write2_b64 v0, v[5:6], v[7:8] offset0:2 offset1:3
+; CHECK-NEXT:    ds_write2_b64 v0, v[6:7], v[8:9] offset0:2 offset1:3
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -5890,55 +3538,12 @@ define void @memmove_p5_p0_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p0_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ubyte v3, v[1:2] offset:15
-; CHECK-NEXT:    flat_load_ubyte v4, v[1:2] offset:14
-; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:13
-; CHECK-NEXT:    flat_load_ubyte v6, v[1:2] offset:12
-; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:11
-; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:10
-; CHECK-NEXT:    flat_load_ubyte v9, v[1:2] offset:9
-; CHECK-NEXT:    flat_load_ubyte v10, v[1:2] offset:8
-; CHECK-NEXT:    flat_load_ubyte v11, v[1:2] offset:7
-; CHECK-NEXT:    flat_load_ubyte v12, v[1:2] offset:6
-; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:5
-; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:4
-; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:3
-; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:2
-; CHECK-NEXT:    flat_load_ubyte v17, v[1:2] offset:1
-; CHECK-NEXT:    flat_load_ubyte v1, v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 16, i1 false)
@@ -5949,100 +3554,24 @@ define void @memmove_p5_p0_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p0_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1e
-; CHECK-NEXT:    flat_load_ubyte v3, v[1:2] offset:30
-; CHECK-NEXT:    flat_load_ubyte v4, v[1:2] offset:29
-; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:28
-; CHECK-NEXT:    flat_load_ubyte v6, v[1:2] offset:27
-; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:26
-; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:25
-; CHECK-NEXT:    flat_load_ubyte v9, v[1:2] offset:24
-; CHECK-NEXT:    flat_load_ubyte v10, v[1:2] offset:23
-; CHECK-NEXT:    flat_load_ubyte v11, v[1:2] offset:22
-; CHECK-NEXT:    flat_load_ubyte v12, v[1:2] offset:21
-; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:20
-; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:19
-; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:18
-; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:17
-; CHECK-NEXT:    flat_load_ubyte v17, v[1:2] offset:16
-; CHECK-NEXT:    flat_load_ubyte v18, v[1:2] offset:15
-; CHECK-NEXT:    flat_load_ubyte v19, v[1:2] offset:14
-; CHECK-NEXT:    flat_load_ubyte v20, v[1:2] offset:13
-; CHECK-NEXT:    flat_load_ubyte v21, v[1:2] offset:12
-; CHECK-NEXT:    flat_load_ubyte v22, v[1:2] offset:11
-; CHECK-NEXT:    flat_load_ubyte v23, v[1:2] offset:10
-; CHECK-NEXT:    flat_load_ubyte v24, v[1:2] offset:9
-; CHECK-NEXT:    flat_load_ubyte v25, v[1:2] offset:8
-; CHECK-NEXT:    flat_load_ubyte v26, v[1:2] offset:7
-; CHECK-NEXT:    flat_load_ubyte v27, v[1:2] offset:6
-; CHECK-NEXT:    flat_load_ubyte v28, v[1:2] offset:5
-; CHECK-NEXT:    flat_load_ubyte v29, v[1:2] offset:4
-; CHECK-NEXT:    flat_load_ubyte v30, v[1:2] offset:3
-; CHECK-NEXT:    flat_load_ubyte v31, v[1:2] offset:2
-; CHECK-NEXT:    flat_load_ubyte v32, v[1:2] offset:1
-; CHECK-NEXT:    flat_load_ubyte v1, v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(30) lgkmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(29) lgkmcnt(29)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(28) lgkmcnt(28)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(27) lgkmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(26) lgkmcnt(26)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(25) lgkmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(24) lgkmcnt(24)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(23) lgkmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(22) lgkmcnt(22)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(21) lgkmcnt(21)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(20) lgkmcnt(20)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(19) lgkmcnt(19)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(18) lgkmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(17) lgkmcnt(17)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(16) lgkmcnt(16)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v21, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v22, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v23, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v24, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v25, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v26, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v27, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v28, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v29, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:30
+; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:28
+; CHECK-NEXT:    flat_load_dwordx3 v[5:7], v[1:2] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
 ; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v30, v0, s[0:3], 0 offen offset:3
+; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v31, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v32, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -6053,103 +3582,19 @@ define void @memmove_p5_p0_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p0_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1f
-; CHECK-NEXT:    flat_load_ubyte v3, v[1:2] offset:31
-; CHECK-NEXT:    flat_load_ubyte v4, v[1:2] offset:30
-; CHECK-NEXT:    flat_load_ubyte v5, v[1:2] offset:29
-; CHECK-NEXT:    flat_load_ubyte v6, v[1:2] offset:28
-; CHECK-NEXT:    flat_load_ubyte v7, v[1:2] offset:27
-; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:26
-; CHECK-NEXT:    flat_load_ubyte v9, v[1:2] offset:25
-; CHECK-NEXT:    flat_load_ubyte v10, v[1:2] offset:24
-; CHECK-NEXT:    flat_load_ubyte v11, v[1:2] offset:23
-; CHECK-NEXT:    flat_load_ubyte v12, v[1:2] offset:22
-; CHECK-NEXT:    flat_load_ubyte v13, v[1:2] offset:21
-; CHECK-NEXT:    flat_load_ubyte v14, v[1:2] offset:20
-; CHECK-NEXT:    flat_load_ubyte v15, v[1:2] offset:19
-; CHECK-NEXT:    flat_load_ubyte v16, v[1:2] offset:18
-; CHECK-NEXT:    flat_load_ubyte v17, v[1:2] offset:17
-; CHECK-NEXT:    flat_load_ubyte v18, v[1:2] offset:16
-; CHECK-NEXT:    flat_load_ubyte v19, v[1:2] offset:15
-; CHECK-NEXT:    flat_load_ubyte v20, v[1:2] offset:14
-; CHECK-NEXT:    flat_load_ubyte v21, v[1:2] offset:13
-; CHECK-NEXT:    flat_load_ubyte v22, v[1:2] offset:12
-; CHECK-NEXT:    flat_load_ubyte v23, v[1:2] offset:11
-; CHECK-NEXT:    flat_load_ubyte v24, v[1:2] offset:10
-; CHECK-NEXT:    flat_load_ubyte v25, v[1:2] offset:9
-; CHECK-NEXT:    flat_load_ubyte v26, v[1:2] offset:8
-; CHECK-NEXT:    flat_load_ubyte v27, v[1:2] offset:7
-; CHECK-NEXT:    flat_load_ubyte v28, v[1:2] offset:6
-; CHECK-NEXT:    flat_load_ubyte v29, v[1:2] offset:5
-; CHECK-NEXT:    flat_load_ubyte v30, v[1:2] offset:4
-; CHECK-NEXT:    flat_load_ubyte v31, v[1:2] offset:3
-; CHECK-NEXT:    flat_load_ubyte v32, v[1:2] offset:2
-; CHECK-NEXT:    flat_load_ubyte v33, v[1:2] offset:1
-; CHECK-NEXT:    flat_load_ubyte v1, v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(31) lgkmcnt(31)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:31
-; CHECK-NEXT:    s_waitcnt vmcnt(30) lgkmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(29) lgkmcnt(29)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(28) lgkmcnt(28)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(27) lgkmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(26) lgkmcnt(26)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(25) lgkmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(24) lgkmcnt(24)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(23) lgkmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(22) lgkmcnt(22)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(21) lgkmcnt(21)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(20) lgkmcnt(20)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(19) lgkmcnt(19)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(18) lgkmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(17) lgkmcnt(17)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(16) lgkmcnt(16)
-; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v21, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v22, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v23, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v24, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v25, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v26, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v27, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v28, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v29, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v30, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v31, v0, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v32, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[3:6], v[1:2] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[7:10], v[1:2]
 ; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v33, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 32, i1 false)
@@ -6160,31 +3605,12 @@ define void @memmove_p5_p0_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p0_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    flat_load_ushort v3, v[1:2] offset:14
-; CHECK-NEXT:    flat_load_ushort v4, v[1:2] offset:12
-; CHECK-NEXT:    flat_load_ushort v5, v[1:2] offset:10
-; CHECK-NEXT:    flat_load_ushort v6, v[1:2] offset:8
-; CHECK-NEXT:    flat_load_ushort v7, v[1:2] offset:6
-; CHECK-NEXT:    flat_load_ushort v8, v[1:2] offset:4
-; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:2
-; CHECK-NEXT:    flat_load_ushort v1, v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
-; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 16, i1 false)
@@ -6195,55 +3621,24 @@ define void @memmove_p5_p0_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p0_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ubyte v3, v[1:2] offset:30
-; CHECK-NEXT:    flat_load_ushort v4, v[1:2] offset:28
-; CHECK-NEXT:    flat_load_ushort v5, v[1:2] offset:26
-; CHECK-NEXT:    flat_load_ushort v6, v[1:2] offset:24
-; CHECK-NEXT:    flat_load_ushort v7, v[1:2] offset:22
-; CHECK-NEXT:    flat_load_ushort v8, v[1:2] offset:20
-; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:18
-; CHECK-NEXT:    flat_load_ushort v10, v[1:2] offset:16
-; CHECK-NEXT:    flat_load_ushort v11, v[1:2] offset:14
-; CHECK-NEXT:    flat_load_ushort v12, v[1:2] offset:12
-; CHECK-NEXT:    flat_load_ushort v13, v[1:2] offset:10
-; CHECK-NEXT:    flat_load_ushort v14, v[1:2] offset:8
-; CHECK-NEXT:    flat_load_ushort v15, v[1:2] offset:6
-; CHECK-NEXT:    flat_load_ushort v16, v[1:2] offset:4
-; CHECK-NEXT:    flat_load_ushort v17, v[1:2] offset:2
-; CHECK-NEXT:    flat_load_ushort v1, v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(13)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(11)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
-; CHECK-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
-; CHECK-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:30
+; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:28
+; CHECK-NEXT:    flat_load_dwordx3 v[5:7], v[1:2] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
 ; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_short v17, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -6254,55 +3649,19 @@ define void @memmove_p5_p0_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p0_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    flat_load_ushort v3, v[1:2] offset:30
-; CHECK-NEXT:    flat_load_ushort v4, v[1:2] offset:28
-; CHECK-NEXT:    flat_load_ushort v5, v[1:2] offset:26
-; CHECK-NEXT:    flat_load_ushort v6, v[1:2] offset:24
-; CHECK-NEXT:    flat_load_ushort v7, v[1:2] offset:22
-; CHECK-NEXT:    flat_load_ushort v8, v[1:2] offset:20
-; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:18
-; CHECK-NEXT:    flat_load_ushort v10, v[1:2] offset:16
-; CHECK-NEXT:    flat_load_ushort v11, v[1:2] offset:14
-; CHECK-NEXT:    flat_load_ushort v12, v[1:2] offset:12
-; CHECK-NEXT:    flat_load_ushort v13, v[1:2] offset:10
-; CHECK-NEXT:    flat_load_ushort v14, v[1:2] offset:8
-; CHECK-NEXT:    flat_load_ushort v15, v[1:2] offset:6
-; CHECK-NEXT:    flat_load_ushort v16, v[1:2] offset:4
-; CHECK-NEXT:    flat_load_ushort v17, v[1:2] offset:2
-; CHECK-NEXT:    flat_load_ushort v1, v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(15) lgkmcnt(15)
-; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14) lgkmcnt(14)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13) lgkmcnt(13)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12) lgkmcnt(12)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11) lgkmcnt(11)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10) lgkmcnt(10)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9) lgkmcnt(9)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8) lgkmcnt(8)
-; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(7)
-; CHECK-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6) lgkmcnt(6)
-; CHECK-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5) lgkmcnt(5)
-; CHECK-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4) lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    flat_load_dwordx4 v[3:6], v[1:2] offset:16
+; CHECK-NEXT:    flat_load_dwordx4 v[7:10], v[1:2]
 ; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_short v17, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 32, i1 false)
@@ -6452,55 +3811,12 @@ define void @memmove_p5_p1_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p1_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ubyte v3, v[1:2], off offset:15
-; CHECK-NEXT:    global_load_ubyte v4, v[1:2], off offset:14
-; CHECK-NEXT:    global_load_ubyte v5, v[1:2], off offset:13
-; CHECK-NEXT:    global_load_ubyte v6, v[1:2], off offset:12
-; CHECK-NEXT:    global_load_ubyte v7, v[1:2], off offset:11
-; CHECK-NEXT:    global_load_ubyte v8, v[1:2], off offset:10
-; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:9
-; CHECK-NEXT:    global_load_ubyte v10, v[1:2], off offset:8
-; CHECK-NEXT:    global_load_ubyte v11, v[1:2], off offset:7
-; CHECK-NEXT:    global_load_ubyte v12, v[1:2], off offset:6
-; CHECK-NEXT:    global_load_ubyte v13, v[1:2], off offset:5
-; CHECK-NEXT:    global_load_ubyte v14, v[1:2], off offset:4
-; CHECK-NEXT:    global_load_ubyte v15, v[1:2], off offset:3
-; CHECK-NEXT:    global_load_ubyte v16, v[1:2], off offset:2
-; CHECK-NEXT:    global_load_ubyte v17, v[1:2], off offset:1
-; CHECK-NEXT:    global_load_ubyte v1, v[1:2], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 16, i1 false)
@@ -6511,100 +3827,24 @@ define void @memmove_p5_p1_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p1_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1e
-; CHECK-NEXT:    global_load_ubyte v3, v[1:2], off offset:30
-; CHECK-NEXT:    global_load_ubyte v4, v[1:2], off offset:29
-; CHECK-NEXT:    global_load_ubyte v5, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ubyte v6, v[1:2], off offset:27
-; CHECK-NEXT:    global_load_ubyte v7, v[1:2], off offset:26
-; CHECK-NEXT:    global_load_ubyte v8, v[1:2], off offset:25
-; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:24
-; CHECK-NEXT:    global_load_ubyte v10, v[1:2], off offset:23
-; CHECK-NEXT:    global_load_ubyte v11, v[1:2], off offset:22
-; CHECK-NEXT:    global_load_ubyte v12, v[1:2], off offset:21
-; CHECK-NEXT:    global_load_ubyte v13, v[1:2], off offset:20
-; CHECK-NEXT:    global_load_ubyte v14, v[1:2], off offset:19
-; CHECK-NEXT:    global_load_ubyte v15, v[1:2], off offset:18
-; CHECK-NEXT:    global_load_ubyte v16, v[1:2], off offset:17
-; CHECK-NEXT:    global_load_ubyte v17, v[1:2], off offset:16
-; CHECK-NEXT:    global_load_ubyte v18, v[1:2], off offset:15
-; CHECK-NEXT:    global_load_ubyte v19, v[1:2], off offset:14
-; CHECK-NEXT:    global_load_ubyte v20, v[1:2], off offset:13
-; CHECK-NEXT:    global_load_ubyte v21, v[1:2], off offset:12
-; CHECK-NEXT:    global_load_ubyte v22, v[1:2], off offset:11
-; CHECK-NEXT:    global_load_ubyte v23, v[1:2], off offset:10
-; CHECK-NEXT:    global_load_ubyte v24, v[1:2], off offset:9
-; CHECK-NEXT:    global_load_ubyte v25, v[1:2], off offset:8
-; CHECK-NEXT:    global_load_ubyte v26, v[1:2], off offset:7
-; CHECK-NEXT:    global_load_ubyte v27, v[1:2], off offset:6
-; CHECK-NEXT:    global_load_ubyte v28, v[1:2], off offset:5
-; CHECK-NEXT:    global_load_ubyte v29, v[1:2], off offset:4
-; CHECK-NEXT:    global_load_ubyte v30, v[1:2], off offset:3
-; CHECK-NEXT:    global_load_ubyte v31, v[1:2], off offset:2
-; CHECK-NEXT:    global_load_ubyte v32, v[1:2], off offset:1
-; CHECK-NEXT:    global_load_ubyte v1, v[1:2], off
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(29)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(21)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v21, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v22, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v23, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v24, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v25, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v26, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v27, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v28, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v29, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    global_load_dwordx3 v[5:7], v[1:2], off offset:16
+; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:28
+; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:30
+; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v30, v0, s[0:3], 0 offen offset:3
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v31, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v32, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -6615,103 +3855,19 @@ define void @memmove_p5_p1_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p1_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1f
-; CHECK-NEXT:    global_load_ubyte v3, v[1:2], off offset:31
-; CHECK-NEXT:    global_load_ubyte v4, v[1:2], off offset:30
-; CHECK-NEXT:    global_load_ubyte v5, v[1:2], off offset:29
-; CHECK-NEXT:    global_load_ubyte v6, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ubyte v7, v[1:2], off offset:27
-; CHECK-NEXT:    global_load_ubyte v8, v[1:2], off offset:26
-; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:25
-; CHECK-NEXT:    global_load_ubyte v10, v[1:2], off offset:24
-; CHECK-NEXT:    global_load_ubyte v11, v[1:2], off offset:23
-; CHECK-NEXT:    global_load_ubyte v12, v[1:2], off offset:22
-; CHECK-NEXT:    global_load_ubyte v13, v[1:2], off offset:21
-; CHECK-NEXT:    global_load_ubyte v14, v[1:2], off offset:20
-; CHECK-NEXT:    global_load_ubyte v15, v[1:2], off offset:19
-; CHECK-NEXT:    global_load_ubyte v16, v[1:2], off offset:18
-; CHECK-NEXT:    global_load_ubyte v17, v[1:2], off offset:17
-; CHECK-NEXT:    global_load_ubyte v18, v[1:2], off offset:16
-; CHECK-NEXT:    global_load_ubyte v19, v[1:2], off offset:15
-; CHECK-NEXT:    global_load_ubyte v20, v[1:2], off offset:14
-; CHECK-NEXT:    global_load_ubyte v21, v[1:2], off offset:13
-; CHECK-NEXT:    global_load_ubyte v22, v[1:2], off offset:12
-; CHECK-NEXT:    global_load_ubyte v23, v[1:2], off offset:11
-; CHECK-NEXT:    global_load_ubyte v24, v[1:2], off offset:10
-; CHECK-NEXT:    global_load_ubyte v25, v[1:2], off offset:9
-; CHECK-NEXT:    global_load_ubyte v26, v[1:2], off offset:8
-; CHECK-NEXT:    global_load_ubyte v27, v[1:2], off offset:7
-; CHECK-NEXT:    global_load_ubyte v28, v[1:2], off offset:6
-; CHECK-NEXT:    global_load_ubyte v29, v[1:2], off offset:5
-; CHECK-NEXT:    global_load_ubyte v30, v[1:2], off offset:4
-; CHECK-NEXT:    global_load_ubyte v31, v[1:2], off offset:3
-; CHECK-NEXT:    global_load_ubyte v32, v[1:2], off offset:2
-; CHECK-NEXT:    global_load_ubyte v33, v[1:2], off offset:1
-; CHECK-NEXT:    global_load_ubyte v1, v[1:2], off
-; CHECK-NEXT:    s_waitcnt vmcnt(31)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:31
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(29)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(21)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v21, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v22, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v23, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v24, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v25, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v26, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v27, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v28, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v29, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v30, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v31, v0, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v32, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v33, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 32, i1 false)
@@ -6722,31 +3878,12 @@ define void @memmove_p5_p1_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p1_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    global_load_ushort v3, v[1:2], off
-; CHECK-NEXT:    global_load_ushort v4, v[1:2], off offset:2
-; CHECK-NEXT:    global_load_ushort v5, v[1:2], off offset:4
-; CHECK-NEXT:    global_load_ushort v6, v[1:2], off offset:6
-; CHECK-NEXT:    global_load_ushort v7, v[1:2], off offset:8
-; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:10
-; CHECK-NEXT:    global_load_ushort v9, v[1:2], off offset:12
-; CHECK-NEXT:    global_load_ushort v1, v[1:2], off offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 16, i1 false)
@@ -6757,55 +3894,24 @@ define void @memmove_p5_p1_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p1_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ubyte v3, v[1:2], off offset:30
-; CHECK-NEXT:    global_load_ushort v4, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ushort v5, v[1:2], off offset:26
-; CHECK-NEXT:    global_load_ushort v6, v[1:2], off offset:24
-; CHECK-NEXT:    global_load_ushort v7, v[1:2], off offset:22
-; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:20
-; CHECK-NEXT:    global_load_ushort v9, v[1:2], off offset:18
-; CHECK-NEXT:    global_load_ushort v10, v[1:2], off offset:16
-; CHECK-NEXT:    global_load_ushort v11, v[1:2], off offset:14
-; CHECK-NEXT:    global_load_ushort v12, v[1:2], off offset:12
-; CHECK-NEXT:    global_load_ushort v13, v[1:2], off offset:10
-; CHECK-NEXT:    global_load_ushort v14, v[1:2], off offset:8
-; CHECK-NEXT:    global_load_ushort v15, v[1:2], off offset:6
-; CHECK-NEXT:    global_load_ushort v16, v[1:2], off offset:4
-; CHECK-NEXT:    global_load_ushort v17, v[1:2], off offset:2
-; CHECK-NEXT:    global_load_ushort v1, v[1:2], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    global_load_dwordx3 v[5:7], v[1:2], off offset:16
+; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:28
+; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:30
+; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_short v17, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -6816,55 +3922,19 @@ define void @memmove_p5_p1_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p1_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ushort v3, v[1:2], off offset:30
-; CHECK-NEXT:    global_load_ushort v4, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ushort v5, v[1:2], off offset:26
-; CHECK-NEXT:    global_load_ushort v6, v[1:2], off offset:24
-; CHECK-NEXT:    global_load_ushort v7, v[1:2], off offset:22
-; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:20
-; CHECK-NEXT:    global_load_ushort v9, v[1:2], off offset:18
-; CHECK-NEXT:    global_load_ushort v10, v[1:2], off offset:16
-; CHECK-NEXT:    global_load_ushort v11, v[1:2], off offset:14
-; CHECK-NEXT:    global_load_ushort v12, v[1:2], off offset:12
-; CHECK-NEXT:    global_load_ushort v13, v[1:2], off offset:10
-; CHECK-NEXT:    global_load_ushort v14, v[1:2], off offset:8
-; CHECK-NEXT:    global_load_ushort v15, v[1:2], off offset:6
-; CHECK-NEXT:    global_load_ushort v16, v[1:2], off offset:4
-; CHECK-NEXT:    global_load_ushort v17, v[1:2], off offset:2
-; CHECK-NEXT:    global_load_ushort v1, v[1:2], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_short v17, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 32, i1 false)
@@ -7009,54 +4079,12 @@ define void @memmove_p5_p3_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p3_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u8 v2, v1 offset:15
-; CHECK-NEXT:    ds_read_u8 v3, v1 offset:14
-; CHECK-NEXT:    ds_read_u8 v4, v1 offset:13
-; CHECK-NEXT:    ds_read_u8 v5, v1 offset:12
-; CHECK-NEXT:    ds_read_u8 v6, v1 offset:11
-; CHECK-NEXT:    ds_read_u8 v7, v1 offset:10
-; CHECK-NEXT:    ds_read_u8 v8, v1 offset:9
-; CHECK-NEXT:    ds_read_u8 v9, v1 offset:8
-; CHECK-NEXT:    ds_read_u8 v10, v1 offset:7
-; CHECK-NEXT:    ds_read_u8 v11, v1 offset:6
-; CHECK-NEXT:    ds_read_u8 v12, v1 offset:5
-; CHECK-NEXT:    ds_read_u8 v13, v1 offset:4
-; CHECK-NEXT:    ds_read_u8 v14, v1 offset:3
-; CHECK-NEXT:    ds_read_u8 v15, v1 offset:2
-; CHECK-NEXT:    ds_read_u8 v16, v1 offset:1
-; CHECK-NEXT:    ds_read_u8 v1, v1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt lgkmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt lgkmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt lgkmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt lgkmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt lgkmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    s_waitcnt lgkmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt lgkmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT:    ds_read2_b64 v[1:4], v1 offset1:1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 16, i1 false)
@@ -7067,72 +4095,25 @@ define void @memmove_p5_p3_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p3_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u8 v2, v1 offset:24
-; CHECK-NEXT:    ds_read_u8 v3, v1 offset:25
-; CHECK-NEXT:    ds_read_u8 v4, v1 offset:26
-; CHECK-NEXT:    ds_read_u8 v5, v1 offset:27
-; CHECK-NEXT:    ds_read_u8 v6, v1 offset:28
-; CHECK-NEXT:    ds_read_u8 v7, v1 offset:29
-; CHECK-NEXT:    ds_read_u8 v8, v1 offset:30
-; CHECK-NEXT:    ds_read_u8 v9, v1 offset:16
-; CHECK-NEXT:    ds_read_u8 v10, v1 offset:17
-; CHECK-NEXT:    ds_read_u8 v11, v1 offset:18
-; CHECK-NEXT:    ds_read_u8 v12, v1 offset:19
-; CHECK-NEXT:    ds_read_u8 v13, v1 offset:20
-; CHECK-NEXT:    ds_read_u8 v14, v1 offset:21
-; CHECK-NEXT:    ds_read_u8 v15, v1 offset:22
-; CHECK-NEXT:    ds_read_u8 v16, v1 offset:23
-; CHECK-NEXT:    ds_read_u8 v17, v1 offset:8
-; CHECK-NEXT:    ds_read_u8 v18, v1 offset:9
-; CHECK-NEXT:    ds_read_u8 v19, v1 offset:10
-; CHECK-NEXT:    ds_read_u8 v20, v1 offset:11
-; CHECK-NEXT:    ds_read_u8 v21, v1 offset:12
-; CHECK-NEXT:    ds_read_u8 v22, v1 offset:13
-; CHECK-NEXT:    ds_read_u8 v23, v1 offset:14
-; CHECK-NEXT:    ds_read_u8 v24, v1 offset:15
-; CHECK-NEXT:    ds_read_u8 v25, v1
-; CHECK-NEXT:    ds_read_u8 v26, v1 offset:1
-; CHECK-NEXT:    ds_read_u8 v27, v1 offset:2
-; CHECK-NEXT:    ds_read_u8 v28, v1 offset:3
-; CHECK-NEXT:    ds_read_u8 v29, v1 offset:4
-; CHECK-NEXT:    ds_read_u8 v30, v1 offset:5
-; CHECK-NEXT:    ds_read_u8 v31, v1 offset:6
-; CHECK-NEXT:    ds_read_u8 v1, v1 offset:7
-; CHECK-NEXT:    s_waitcnt lgkmcnt(24)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt lgkmcnt(16)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt lgkmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v24, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_store_byte v23, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_store_byte v22, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_store_byte v21, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_store_byte v31, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_store_byte v30, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_store_byte v29, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_store_byte v28, v0, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_store_byte v27, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_store_byte v26, v0, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_store_byte v25, v0, s[0:3], 0 offen
+; CHECK-NEXT:    ds_read_b32 v8, v1 offset:24
+; CHECK-NEXT:    ds_read_u16 v9, v1 offset:28
+; CHECK-NEXT:    ds_read_u8 v10, v1 offset:30
+; CHECK-NEXT:    ds_read2_b64 v[2:5], v1 offset1:1
+; CHECK-NEXT:    ds_read_b64 v[6:7], v1 offset:16
+; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
+; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
+; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -7143,74 +4124,18 @@ define void @memmove_p5_p3_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p3_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u8 v2, v1 offset:24
-; CHECK-NEXT:    ds_read_u8 v3, v1 offset:25
-; CHECK-NEXT:    ds_read_u8 v4, v1 offset:26
-; CHECK-NEXT:    ds_read_u8 v5, v1 offset:27
-; CHECK-NEXT:    ds_read_u8 v6, v1 offset:28
-; CHECK-NEXT:    ds_read_u8 v7, v1 offset:29
-; CHECK-NEXT:    ds_read_u8 v8, v1 offset:30
-; CHECK-NEXT:    ds_read_u8 v9, v1 offset:31
-; CHECK-NEXT:    ds_read_u8 v10, v1 offset:16
-; CHECK-NEXT:    ds_read_u8 v11, v1 offset:17
-; CHECK-NEXT:    ds_read_u8 v12, v1 offset:18
-; CHECK-NEXT:    ds_read_u8 v13, v1 offset:19
-; CHECK-NEXT:    ds_read_u8 v14, v1 offset:20
-; CHECK-NEXT:    ds_read_u8 v15, v1 offset:21
-; CHECK-NEXT:    ds_read_u8 v16, v1 offset:22
-; CHECK-NEXT:    ds_read_u8 v17, v1 offset:23
-; CHECK-NEXT:    ds_read_u8 v18, v1 offset:8
-; CHECK-NEXT:    ds_read_u8 v19, v1 offset:9
-; CHECK-NEXT:    ds_read_u8 v20, v1 offset:10
-; CHECK-NEXT:    ds_read_u8 v21, v1 offset:11
-; CHECK-NEXT:    ds_read_u8 v22, v1 offset:12
-; CHECK-NEXT:    ds_read_u8 v23, v1 offset:13
-; CHECK-NEXT:    ds_read_u8 v24, v1 offset:14
-; CHECK-NEXT:    ds_read_u8 v25, v1 offset:15
-; CHECK-NEXT:    ds_read_u8 v26, v1
-; CHECK-NEXT:    ds_read_u8 v27, v1 offset:1
-; CHECK-NEXT:    ds_read_u8 v28, v1 offset:2
-; CHECK-NEXT:    ds_read_u8 v29, v1 offset:3
-; CHECK-NEXT:    ds_read_u8 v30, v1 offset:4
-; CHECK-NEXT:    ds_read_u8 v31, v1 offset:5
-; CHECK-NEXT:    ds_read_u8 v32, v1 offset:6
-; CHECK-NEXT:    ds_read_u8 v1, v1 offset:7
-; CHECK-NEXT:    s_waitcnt lgkmcnt(24)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:31
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt lgkmcnt(16)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt lgkmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v25, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_store_byte v24, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_store_byte v23, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_store_byte v22, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_store_byte v21, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_store_byte v32, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_store_byte v31, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_store_byte v30, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_store_byte v29, v0, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_store_byte v28, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_store_byte v27, v0, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_store_byte v26, v0, s[0:3], 0 offen
+; CHECK-NEXT:    ds_read2_b64 v[2:5], v1 offset1:1
+; CHECK-NEXT:    ds_read2_b64 v[6:9], v1 offset0:2 offset1:3
+; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 32, i1 false)
@@ -7221,30 +4146,12 @@ define void @memmove_p5_p3_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p3_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u16 v2, v1
-; CHECK-NEXT:    ds_read_u16 v3, v1 offset:2
-; CHECK-NEXT:    ds_read_u16 v4, v1 offset:4
-; CHECK-NEXT:    ds_read_u16 v5, v1 offset:6
-; CHECK-NEXT:    ds_read_u16 v6, v1 offset:8
-; CHECK-NEXT:    ds_read_u16 v7, v1 offset:10
-; CHECK-NEXT:    ds_read_u16 v8, v1 offset:12
-; CHECK-NEXT:    ds_read_u16 v1, v1 offset:14
-; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
-; CHECK-NEXT:    buffer_store_short v2, v0, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt lgkmcnt(6)
-; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    s_waitcnt lgkmcnt(5)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    ds_read2_b64 v[1:4], v1 offset1:1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 16, i1 false)
@@ -7255,54 +4162,25 @@ define void @memmove_p5_p3_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p3_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u8 v2, v1 offset:30
-; CHECK-NEXT:    ds_read_u16 v3, v1 offset:28
-; CHECK-NEXT:    ds_read_u16 v4, v1 offset:26
-; CHECK-NEXT:    ds_read_u16 v5, v1 offset:24
-; CHECK-NEXT:    ds_read_u16 v6, v1 offset:22
-; CHECK-NEXT:    ds_read_u16 v7, v1 offset:20
-; CHECK-NEXT:    ds_read_u16 v8, v1 offset:18
-; CHECK-NEXT:    ds_read_u16 v9, v1 offset:16
-; CHECK-NEXT:    ds_read_u16 v10, v1 offset:14
-; CHECK-NEXT:    ds_read_u16 v11, v1 offset:12
-; CHECK-NEXT:    ds_read_u16 v12, v1 offset:10
-; CHECK-NEXT:    ds_read_u16 v13, v1 offset:8
-; CHECK-NEXT:    ds_read_u16 v14, v1 offset:6
-; CHECK-NEXT:    ds_read_u16 v15, v1 offset:4
-; CHECK-NEXT:    ds_read_u16 v16, v1 offset:2
-; CHECK-NEXT:    ds_read_u16 v1, v1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt lgkmcnt(14)
-; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt lgkmcnt(11)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt lgkmcnt(10)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt lgkmcnt(9)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt lgkmcnt(8)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
-; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt lgkmcnt(6)
-; CHECK-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt lgkmcnt(5)
-; CHECK-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT:    ds_read_b32 v8, v1 offset:24
+; CHECK-NEXT:    ds_read_u16 v9, v1 offset:28
+; CHECK-NEXT:    ds_read_u8 v10, v1 offset:30
+; CHECK-NEXT:    ds_read2_b64 v[2:5], v1 offset1:1
+; CHECK-NEXT:    ds_read_b64 v[6:7], v1 offset:16
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -7313,54 +4191,18 @@ define void @memmove_p5_p3_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p3_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_u16 v2, v1 offset:30
-; CHECK-NEXT:    ds_read_u16 v3, v1 offset:28
-; CHECK-NEXT:    ds_read_u16 v4, v1 offset:26
-; CHECK-NEXT:    ds_read_u16 v5, v1 offset:24
-; CHECK-NEXT:    ds_read_u16 v6, v1 offset:22
-; CHECK-NEXT:    ds_read_u16 v7, v1 offset:20
-; CHECK-NEXT:    ds_read_u16 v8, v1 offset:18
-; CHECK-NEXT:    ds_read_u16 v9, v1 offset:16
-; CHECK-NEXT:    ds_read_u16 v10, v1 offset:14
-; CHECK-NEXT:    ds_read_u16 v11, v1 offset:12
-; CHECK-NEXT:    ds_read_u16 v12, v1 offset:10
-; CHECK-NEXT:    ds_read_u16 v13, v1 offset:8
-; CHECK-NEXT:    ds_read_u16 v14, v1 offset:6
-; CHECK-NEXT:    ds_read_u16 v15, v1 offset:4
-; CHECK-NEXT:    ds_read_u16 v16, v1 offset:2
-; CHECK-NEXT:    ds_read_u16 v1, v1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(15)
-; CHECK-NEXT:    buffer_store_short v2, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt lgkmcnt(14)
-; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt lgkmcnt(13)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt lgkmcnt(12)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt lgkmcnt(11)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt lgkmcnt(10)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt lgkmcnt(9)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt lgkmcnt(8)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt lgkmcnt(7)
-; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt lgkmcnt(6)
-; CHECK-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt lgkmcnt(5)
-; CHECK-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    ds_read2_b64 v[2:5], v1 offset1:1
+; CHECK-NEXT:    ds_read2_b64 v[6:9], v1 offset0:2 offset1:3
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 32, i1 false)
@@ -7505,55 +4347,12 @@ define void @memmove_p5_p4_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p4_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ubyte v3, v[1:2], off offset:15
-; CHECK-NEXT:    global_load_ubyte v4, v[1:2], off offset:14
-; CHECK-NEXT:    global_load_ubyte v5, v[1:2], off offset:13
-; CHECK-NEXT:    global_load_ubyte v6, v[1:2], off offset:12
-; CHECK-NEXT:    global_load_ubyte v7, v[1:2], off offset:11
-; CHECK-NEXT:    global_load_ubyte v8, v[1:2], off offset:10
-; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:9
-; CHECK-NEXT:    global_load_ubyte v10, v[1:2], off offset:8
-; CHECK-NEXT:    global_load_ubyte v11, v[1:2], off offset:7
-; CHECK-NEXT:    global_load_ubyte v12, v[1:2], off offset:6
-; CHECK-NEXT:    global_load_ubyte v13, v[1:2], off offset:5
-; CHECK-NEXT:    global_load_ubyte v14, v[1:2], off offset:4
-; CHECK-NEXT:    global_load_ubyte v15, v[1:2], off offset:3
-; CHECK-NEXT:    global_load_ubyte v16, v[1:2], off offset:2
-; CHECK-NEXT:    global_load_ubyte v17, v[1:2], off offset:1
-; CHECK-NEXT:    global_load_ubyte v1, v[1:2], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 16, i1 false)
@@ -7564,100 +4363,24 @@ define void @memmove_p5_p4_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p4_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1e
-; CHECK-NEXT:    global_load_ubyte v3, v[1:2], off offset:30
-; CHECK-NEXT:    global_load_ubyte v4, v[1:2], off offset:29
-; CHECK-NEXT:    global_load_ubyte v5, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ubyte v6, v[1:2], off offset:27
-; CHECK-NEXT:    global_load_ubyte v7, v[1:2], off offset:26
-; CHECK-NEXT:    global_load_ubyte v8, v[1:2], off offset:25
-; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:24
-; CHECK-NEXT:    global_load_ubyte v10, v[1:2], off offset:23
-; CHECK-NEXT:    global_load_ubyte v11, v[1:2], off offset:22
-; CHECK-NEXT:    global_load_ubyte v12, v[1:2], off offset:21
-; CHECK-NEXT:    global_load_ubyte v13, v[1:2], off offset:20
-; CHECK-NEXT:    global_load_ubyte v14, v[1:2], off offset:19
-; CHECK-NEXT:    global_load_ubyte v15, v[1:2], off offset:18
-; CHECK-NEXT:    global_load_ubyte v16, v[1:2], off offset:17
-; CHECK-NEXT:    global_load_ubyte v17, v[1:2], off offset:16
-; CHECK-NEXT:    global_load_ubyte v18, v[1:2], off offset:15
-; CHECK-NEXT:    global_load_ubyte v19, v[1:2], off offset:14
-; CHECK-NEXT:    global_load_ubyte v20, v[1:2], off offset:13
-; CHECK-NEXT:    global_load_ubyte v21, v[1:2], off offset:12
-; CHECK-NEXT:    global_load_ubyte v22, v[1:2], off offset:11
-; CHECK-NEXT:    global_load_ubyte v23, v[1:2], off offset:10
-; CHECK-NEXT:    global_load_ubyte v24, v[1:2], off offset:9
-; CHECK-NEXT:    global_load_ubyte v25, v[1:2], off offset:8
-; CHECK-NEXT:    global_load_ubyte v26, v[1:2], off offset:7
-; CHECK-NEXT:    global_load_ubyte v27, v[1:2], off offset:6
-; CHECK-NEXT:    global_load_ubyte v28, v[1:2], off offset:5
-; CHECK-NEXT:    global_load_ubyte v29, v[1:2], off offset:4
-; CHECK-NEXT:    global_load_ubyte v30, v[1:2], off offset:3
-; CHECK-NEXT:    global_load_ubyte v31, v[1:2], off offset:2
-; CHECK-NEXT:    global_load_ubyte v32, v[1:2], off offset:1
-; CHECK-NEXT:    global_load_ubyte v1, v[1:2], off
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(29)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(21)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v21, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v22, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v23, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v24, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v25, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v26, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v27, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v28, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v29, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    global_load_dwordx3 v[5:7], v[1:2], off offset:16
+; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:28
+; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:30
+; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v30, v0, s[0:3], 0 offen offset:3
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v31, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v32, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -7668,103 +4391,19 @@ define void @memmove_p5_p4_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p4_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1f
-; CHECK-NEXT:    global_load_ubyte v3, v[1:2], off offset:31
-; CHECK-NEXT:    global_load_ubyte v4, v[1:2], off offset:30
-; CHECK-NEXT:    global_load_ubyte v5, v[1:2], off offset:29
-; CHECK-NEXT:    global_load_ubyte v6, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ubyte v7, v[1:2], off offset:27
-; CHECK-NEXT:    global_load_ubyte v8, v[1:2], off offset:26
-; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:25
-; CHECK-NEXT:    global_load_ubyte v10, v[1:2], off offset:24
-; CHECK-NEXT:    global_load_ubyte v11, v[1:2], off offset:23
-; CHECK-NEXT:    global_load_ubyte v12, v[1:2], off offset:22
-; CHECK-NEXT:    global_load_ubyte v13, v[1:2], off offset:21
-; CHECK-NEXT:    global_load_ubyte v14, v[1:2], off offset:20
-; CHECK-NEXT:    global_load_ubyte v15, v[1:2], off offset:19
-; CHECK-NEXT:    global_load_ubyte v16, v[1:2], off offset:18
-; CHECK-NEXT:    global_load_ubyte v17, v[1:2], off offset:17
-; CHECK-NEXT:    global_load_ubyte v18, v[1:2], off offset:16
-; CHECK-NEXT:    global_load_ubyte v19, v[1:2], off offset:15
-; CHECK-NEXT:    global_load_ubyte v20, v[1:2], off offset:14
-; CHECK-NEXT:    global_load_ubyte v21, v[1:2], off offset:13
-; CHECK-NEXT:    global_load_ubyte v22, v[1:2], off offset:12
-; CHECK-NEXT:    global_load_ubyte v23, v[1:2], off offset:11
-; CHECK-NEXT:    global_load_ubyte v24, v[1:2], off offset:10
-; CHECK-NEXT:    global_load_ubyte v25, v[1:2], off offset:9
-; CHECK-NEXT:    global_load_ubyte v26, v[1:2], off offset:8
-; CHECK-NEXT:    global_load_ubyte v27, v[1:2], off offset:7
-; CHECK-NEXT:    global_load_ubyte v28, v[1:2], off offset:6
-; CHECK-NEXT:    global_load_ubyte v29, v[1:2], off offset:5
-; CHECK-NEXT:    global_load_ubyte v30, v[1:2], off offset:4
-; CHECK-NEXT:    global_load_ubyte v31, v[1:2], off offset:3
-; CHECK-NEXT:    global_load_ubyte v32, v[1:2], off offset:2
-; CHECK-NEXT:    global_load_ubyte v33, v[1:2], off offset:1
-; CHECK-NEXT:    global_load_ubyte v1, v[1:2], off
-; CHECK-NEXT:    s_waitcnt vmcnt(31)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:31
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(29)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(21)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v21, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v22, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v23, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v24, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v25, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v26, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v27, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v28, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v29, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v30, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v31, v0, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v32, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v33, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 32, i1 false)
@@ -7775,31 +4414,12 @@ define void @memmove_p5_p4_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p4_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    global_load_ushort v3, v[1:2], off
-; CHECK-NEXT:    global_load_ushort v4, v[1:2], off offset:2
-; CHECK-NEXT:    global_load_ushort v5, v[1:2], off offset:4
-; CHECK-NEXT:    global_load_ushort v6, v[1:2], off offset:6
-; CHECK-NEXT:    global_load_ushort v7, v[1:2], off offset:8
-; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:10
-; CHECK-NEXT:    global_load_ushort v9, v[1:2], off offset:12
-; CHECK-NEXT:    global_load_ushort v1, v[1:2], off offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 16, i1 false)
@@ -7810,55 +4430,24 @@ define void @memmove_p5_p4_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p4_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ubyte v3, v[1:2], off offset:30
-; CHECK-NEXT:    global_load_ushort v4, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ushort v5, v[1:2], off offset:26
-; CHECK-NEXT:    global_load_ushort v6, v[1:2], off offset:24
-; CHECK-NEXT:    global_load_ushort v7, v[1:2], off offset:22
-; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:20
-; CHECK-NEXT:    global_load_ushort v9, v[1:2], off offset:18
-; CHECK-NEXT:    global_load_ushort v10, v[1:2], off offset:16
-; CHECK-NEXT:    global_load_ushort v11, v[1:2], off offset:14
-; CHECK-NEXT:    global_load_ushort v12, v[1:2], off offset:12
-; CHECK-NEXT:    global_load_ushort v13, v[1:2], off offset:10
-; CHECK-NEXT:    global_load_ushort v14, v[1:2], off offset:8
-; CHECK-NEXT:    global_load_ushort v15, v[1:2], off offset:6
-; CHECK-NEXT:    global_load_ushort v16, v[1:2], off offset:4
-; CHECK-NEXT:    global_load_ushort v17, v[1:2], off offset:2
-; CHECK-NEXT:    global_load_ushort v1, v[1:2], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    global_load_dwordx3 v[5:7], v[1:2], off offset:16
+; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:28
+; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:30
+; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_short v17, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -7869,55 +4458,19 @@ define void @memmove_p5_p4_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p4_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    global_load_ushort v3, v[1:2], off offset:30
-; CHECK-NEXT:    global_load_ushort v4, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ushort v5, v[1:2], off offset:26
-; CHECK-NEXT:    global_load_ushort v6, v[1:2], off offset:24
-; CHECK-NEXT:    global_load_ushort v7, v[1:2], off offset:22
-; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:20
-; CHECK-NEXT:    global_load_ushort v9, v[1:2], off offset:18
-; CHECK-NEXT:    global_load_ushort v10, v[1:2], off offset:16
-; CHECK-NEXT:    global_load_ushort v11, v[1:2], off offset:14
-; CHECK-NEXT:    global_load_ushort v12, v[1:2], off offset:12
-; CHECK-NEXT:    global_load_ushort v13, v[1:2], off offset:10
-; CHECK-NEXT:    global_load_ushort v14, v[1:2], off offset:8
-; CHECK-NEXT:    global_load_ushort v15, v[1:2], off offset:6
-; CHECK-NEXT:    global_load_ushort v16, v[1:2], off offset:4
-; CHECK-NEXT:    global_load_ushort v17, v[1:2], off offset:2
-; CHECK-NEXT:    global_load_ushort v1, v[1:2], off
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_short v17, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 32, i1 false)
@@ -8062,55 +4615,19 @@ define void @memmove_p5_p5_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p5_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:3
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 16, i1 false)
@@ -8121,100 +4638,34 @@ define void @memmove_p5_p5_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p5_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1e
-; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(29)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(21)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v21, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v22, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v23, v0, s[0:3], 0 offen offset:9
+; CHECK-NEXT:    s_clause 0x8
+; CHECK-NEXT:    buffer_load_ushort v2, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v9, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v24, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_short v2, v0, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v25, v0, s[0:3], 0 offen offset:7
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v26, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v27, v0, s[0:3], 0 offen offset:5
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:20
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v28, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v29, v0, s[0:3], 0 offen offset:3
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v30, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v31, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -8225,103 +4676,31 @@ define void @memmove_p5_p5_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p5_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x1f
-; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:31
-; CHECK-NEXT:    buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:7
-; CHECK-NEXT:    buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:5
-; CHECK-NEXT:    buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:3
-; CHECK-NEXT:    buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:1
-; CHECK-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(31)
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:31
-; CHECK-NEXT:    s_waitcnt vmcnt(30)
-; CHECK-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(29)
-; CHECK-NEXT:    buffer_store_byte v4, v0, s[0:3], 0 offen offset:29
-; CHECK-NEXT:    s_waitcnt vmcnt(28)
-; CHECK-NEXT:    buffer_store_byte v5, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(27)
-; CHECK-NEXT:    buffer_store_byte v6, v0, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(26)
-; CHECK-NEXT:    buffer_store_byte v7, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(25)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:25
-; CHECK-NEXT:    s_waitcnt vmcnt(24)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(23)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:23
-; CHECK-NEXT:    s_waitcnt vmcnt(22)
-; CHECK-NEXT:    buffer_store_byte v11, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(21)
-; CHECK-NEXT:    buffer_store_byte v12, v0, s[0:3], 0 offen offset:21
-; CHECK-NEXT:    s_waitcnt vmcnt(20)
-; CHECK-NEXT:    buffer_store_byte v13, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(19)
-; CHECK-NEXT:    buffer_store_byte v14, v0, s[0:3], 0 offen offset:19
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
-; CHECK-NEXT:    buffer_store_byte v15, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(17)
-; CHECK-NEXT:    buffer_store_byte v16, v0, s[0:3], 0 offen offset:17
-; CHECK-NEXT:    s_waitcnt vmcnt(16)
-; CHECK-NEXT:    buffer_store_byte v17, v0, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v18, v0, s[0:3], 0 offen offset:15
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_byte v19, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_byte v20, v0, s[0:3], 0 offen offset:13
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_byte v21, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_byte v22, v0, s[0:3], 0 offen offset:11
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_byte v23, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_byte v24, v0, s[0:3], 0 offen offset:9
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_byte v25, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_byte v26, v0, s[0:3], 0 offen offset:7
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_byte v27, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_byte v28, v0, s[0:3], 0 offen offset:5
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_byte v29, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:20
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v30, v0, s[0:3], 0 offen offset:3
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v31, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v32, v0, s[0:3], 0 offen offset:1
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 32, i1 false)
@@ -8332,31 +4711,19 @@ define void @memmove_p5_p5_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p5_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    buffer_load_ushort v2, v1, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ushort v3, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v4, v1, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ushort v5, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ushort v6, v1, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ushort v7, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ushort v8, v1, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ushort v1, v1, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_short v2, v0, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    s_clause 0x3
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 16, i1 false)
@@ -8367,55 +4734,34 @@ define void @memmove_p5_p5_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p5_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ushort v3, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ushort v4, v1, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ushort v5, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ushort v6, v1, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ushort v7, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ushort v8, v1, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ushort v9, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ushort v10, v1, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ushort v11, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v12, v1, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ushort v13, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ushort v14, v1, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ushort v15, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ushort v16, v1, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ushort v1, v1, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:18
+; CHECK-NEXT:    s_clause 0x8
+; CHECK-NEXT:    buffer_load_ushort v2, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v9, v1, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_short v2, v0, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:20
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -8426,55 +4772,31 @@ define void @memmove_p5_p5_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p5_p5_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0xf
-; CHECK-NEXT:    buffer_load_ushort v2, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_ushort v3, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_ushort v4, v1, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    buffer_load_ushort v5, v1, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ushort v6, v1, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    buffer_load_ushort v7, v1, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_ushort v8, v1, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    buffer_load_ushort v9, v1, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_ushort v10, v1, s[0:3], 0 offen offset:14
-; CHECK-NEXT:    buffer_load_ushort v11, v1, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_ushort v12, v1, s[0:3], 0 offen offset:10
-; CHECK-NEXT:    buffer_load_ushort v13, v1, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_ushort v14, v1, s[0:3], 0 offen offset:6
-; CHECK-NEXT:    buffer_load_ushort v15, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ushort v16, v1, s[0:3], 0 offen offset:2
-; CHECK-NEXT:    buffer_load_ushort v1, v1, s[0:3], 0 offen
-; CHECK-NEXT:    s_waitcnt vmcnt(15)
-; CHECK-NEXT:    buffer_store_short v2, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(14)
-; CHECK-NEXT:    buffer_store_short v3, v0, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(13)
-; CHECK-NEXT:    buffer_store_short v4, v0, s[0:3], 0 offen offset:26
-; CHECK-NEXT:    s_waitcnt vmcnt(12)
-; CHECK-NEXT:    buffer_store_short v5, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(11)
-; CHECK-NEXT:    buffer_store_short v6, v0, s[0:3], 0 offen offset:22
-; CHECK-NEXT:    s_waitcnt vmcnt(10)
-; CHECK-NEXT:    buffer_store_short v7, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    s_waitcnt vmcnt(9)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:18
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_short v10, v0, s[0:3], 0 offen offset:14
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_short v11, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_short v12, v0, s[0:3], 0 offen offset:10
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_short v13, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:20
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_short v14, v0, s[0:3], 0 offen offset:6
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_short v15, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_short v16, v0, s[0:3], 0 offen offset:2
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 32, i1 false)
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-commute.ll b/llvm/test/CodeGen/AMDGPU/sdwa-commute.ll
index 94bc6d46b2395b..8ad6a4e534d232 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-commute.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-commute.ll
@@ -19,8 +19,8 @@ define void @extracted_values(ptr %ret_struct, ptr addrspace(3) %arg0, ptr addrs
 ; CHECK-NEXT:    v_sub_f16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; CHECK-NEXT:    v_add_f16_e32 v4, v6, v7
 ; CHECK-NEXT:    v_add_f16_e32 v2, v3, v2
-; CHECK-NEXT:    flat_store_short v[0:1], v4
-; CHECK-NEXT:    flat_store_short v[0:1], v2 offset:2
+; CHECK-NEXT:    v_pack_b32_f16 v2, v4, v2
+; CHECK-NEXT:    flat_store_dword v[0:1], v2
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:

>From d56a384e1ab31ed9864ffac7469112125f1c362a Mon Sep 17 00:00:00 2001
From: Fabian Ritter <fabian.ritter at amd.com>
Date: Mon, 30 Sep 2024 11:34:19 -0400
Subject: [PATCH 2/3] fixup! [AMDGPU] Enable unaligned scratch accesses

Make enabling flat scratch not imply that unaligned scratch accesses are valid.
---
 .../AMDGPU/AMDGPUTargetTransformInfo.cpp      |    3 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |    3 +-
 .../CodeGen/AMDGPU/GlobalISel/flat-scratch.ll | 1204 +++++-
 .../AMDGPU/GlobalISel/legalize-load-flat.mir  | 3216 ++++++++++++--
 .../GlobalISel/legalize-load-private.mir      | 3825 +++++++++++++++--
 llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll    |   51 +-
 .../CodeGen/AMDGPU/unaligned-load-store.ll    |   28 +-
 7 files changed, 7564 insertions(+), 766 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index cbf6b7d60f0366..d701bf037fdfa6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -387,8 +387,7 @@ bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
   // them later if they may access private memory. We don't have enough context
   // here, and legalization can handle it.
   if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
-    return (Alignment >= 4 || ST->hasUnalignedScratchAccessEnabled() ||
-            ST->enableFlatScratch()) &&
+    return (Alignment >= 4 || ST->hasUnalignedScratchAccessEnabled()) &&
            ChainSizeInBytes <= ST->getMaxPrivateElementSize();
   }
   return true;
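
For reference, here is a minimal standalone model of the predicate this hunk leaves in GCNTTIImpl::isLegalToVectorizeMemChain for the private (scratch) address space. The struct and function names below are illustrative stand-ins for the subtarget queries used above, not the in-tree API:

// Illustrative model only: plain fields stand in for the GCNSubtarget
// queries hasUnalignedScratchAccessEnabled() and getMaxPrivateElementSize().
#include <iostream>

struct SubtargetModel {
  bool UnalignedScratchAccessEnabled; // FeatureUnalignedScratchAccess in effect
  unsigned MaxPrivateElementSize;     // in bytes
};

// With this fixup, enabling flat scratch instructions alone no longer makes an
// under-aligned chain legal to vectorize; the unaligned-scratch feature must be on.
bool isLegalPrivateMemChain(unsigned ChainSizeInBytes, unsigned Alignment,
                            const SubtargetModel &ST) {
  return (Alignment >= 4 || ST.UnalignedScratchAccessEnabled) &&
         ChainSizeInBytes <= ST.MaxPrivateElementSize;
}

int main() {
  SubtargetModel FeatureOff{false, 16};
  SubtargetModel FeatureOn{true, 16};
  // 8-byte chain at align 1: rejected without the feature, accepted with it.
  std::cout << isLegalPrivateMemChain(8, 1, FeatureOff) << '\n'; // 0
  std::cout << isLegalPrivateMemChain(8, 1, FeatureOn) << '\n';  // 1
}
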
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 44ca5c5ac610b3..8ba5f64f92a617 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1833,8 +1833,7 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
     if (IsFast)
       *IsFast = AlignedBy4;
 
-    return AlignedBy4 || Subtarget->enableFlatScratch() ||
-           Subtarget->hasUnalignedScratchAccessEnabled();
+    return AlignedBy4 || Subtarget->hasUnalignedScratchAccessEnabled();
   }
 
   // So long as they are correct, wide global memory operations perform better
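
Similarly, a standalone sketch of the scratch case in SITargetLowering::allowsMisalignedMemoryAccessesImpl after this change; the parameter names are illustrative and do not match the real signature:

// Illustrative model only: mirrors the scratch/private case shown above, where
// IsFast still reflects dword alignment even when the access is allowed.
#include <iostream>

bool allowsMisalignedScratchAccess(unsigned AlignInBytes,
                                   bool UnalignedScratchAccessEnabled,
                                   unsigned *IsFast = nullptr) {
  const bool AlignedBy4 = AlignInBytes >= 4;
  if (IsFast)
    *IsFast = AlignedBy4; // allowed-but-unaligned accesses are not reported fast
  return AlignedBy4 || UnalignedScratchAccessEnabled;
}

int main() {
  unsigned Fast = 0;
  std::cout << allowsMisalignedScratchAccess(1, false, &Fast) << ' ' << Fast << '\n'; // 0 0
  std::cout << allowsMisalignedScratchAccess(1, true,  &Fast) << ' ' << Fast << '\n'; // 1 0
  std::cout << allowsMisalignedScratchAccess(4, false, &Fast) << ' ' << Fast << '\n'; // 1 1
}
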
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index ce528467cd35b4..f2ade4df0f42d2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -2416,23 +2416,93 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v1, 15
-; GFX12-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS
+; GFX12-NEXT:    scratch_store_b8 v0, v1, off scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS
+; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:1 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:3 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:5 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:6 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:7 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_load_u8 v1, v0, off scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:1 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:2 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:3 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:5 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:6 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    scratch_load_u8 v0, v0, off offset:7 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; UNALIGNED_GFX9-LABEL: store_load_i64_unaligned:
 ; UNALIGNED_GFX9:       ; %bb.0: ; %bb
 ; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; UNALIGNED_GFX9-NEXT:    v_mov_b32_e32 v1, 15
-; UNALIGNED_GFX9-NEXT:    v_mov_b32_e32 v2, 0
-; UNALIGNED_GFX9-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
+; UNALIGNED_GFX9-NEXT:    v_mov_b32_e32 v4, 15
+; UNALIGNED_GFX9-NEXT:    v_add_u32_e32 v1, 4, v0
+; UNALIGNED_GFX9-NEXT:    v_add_u32_e32 v2, 2, v0
+; UNALIGNED_GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
+; UNALIGNED_GFX9-NEXT:    scratch_store_byte v0, v4, off
 ; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
-; UNALIGNED_GFX9-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
+; UNALIGNED_GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; UNALIGNED_GFX9-NEXT:    v_add_u32_e32 v6, 6, v0
+; UNALIGNED_GFX9-NEXT:    scratch_store_byte v3, v4, off
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    v_add_u32_e32 v5, 3, v0
+; UNALIGNED_GFX9-NEXT:    scratch_store_byte v2, v4, off
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    scratch_store_byte v5, v4, off
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    v_add_u32_e32 v7, 5, v0
+; UNALIGNED_GFX9-NEXT:    scratch_store_byte v1, v4, off
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    scratch_store_byte v7, v4, off
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    v_add_u32_e32 v8, 7, v0
+; UNALIGNED_GFX9-NEXT:    scratch_store_byte v6, v4, off
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    scratch_store_byte v8, v4, off
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    scratch_load_ubyte v4, v0, off glc
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    ; kill: killed $vgpr6
+; UNALIGNED_GFX9-NEXT:    ; kill: killed $vgpr1
+; UNALIGNED_GFX9-NEXT:    ; kill: killed $vgpr3
+; UNALIGNED_GFX9-NEXT:    ; kill: killed $vgpr5
+; UNALIGNED_GFX9-NEXT:    ; kill: killed $vgpr8
+; UNALIGNED_GFX9-NEXT:    ; kill: killed $vgpr7
+; UNALIGNED_GFX9-NEXT:    ; kill: killed $vgpr2
+; UNALIGNED_GFX9-NEXT:    ; kill: killed $vgpr0
+; UNALIGNED_GFX9-NEXT:    scratch_load_ubyte v4, v3, off glc
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    scratch_load_ubyte v4, v2, off glc
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    scratch_load_ubyte v4, v5, off glc
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    scratch_load_ubyte v4, v1, off glc
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    scratch_load_ubyte v4, v7, off glc
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    scratch_load_ubyte v4, v6, off glc
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    scratch_load_ubyte v4, v8, off glc
 ; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; UNALIGNED_GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2441,30 +2511,143 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) {
 ; UNALIGNED_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; UNALIGNED_GFX10-NEXT:    v_mov_b32_e32 v1, 15
 ; UNALIGNED_GFX10-NEXT:    v_mov_b32_e32 v2, 0
-; UNALIGNED_GFX10-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
+; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
+; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v3, 4, v0
+; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v5, 2, v0
+; UNALIGNED_GFX10-NEXT:    scratch_store_byte v0, v1, off
 ; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; UNALIGNED_GFX10-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
+; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v1, 3, v0
+; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v6, 5, v0
+; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v7, 6, v0
+; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v8, 7, v0
+; UNALIGNED_GFX10-NEXT:    scratch_store_byte v4, v2, off
+; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX10-NEXT:    scratch_store_byte v5, v2, off
+; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX10-NEXT:    scratch_store_byte v1, v2, off
+; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX10-NEXT:    scratch_store_byte v3, v2, off
+; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX10-NEXT:    scratch_store_byte v6, v2, off
+; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX10-NEXT:    scratch_store_byte v7, v2, off
+; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX10-NEXT:    scratch_store_byte v8, v2, off
+; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX10-NEXT:    scratch_load_ubyte v0, v0, off glc dlc
+; UNALIGNED_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX10-NEXT:    scratch_load_ubyte v0, v4, off glc dlc
+; UNALIGNED_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX10-NEXT:    scratch_load_ubyte v0, v5, off glc dlc
+; UNALIGNED_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX10-NEXT:    scratch_load_ubyte v0, v1, off glc dlc
+; UNALIGNED_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX10-NEXT:    scratch_load_ubyte v0, v3, off glc dlc
+; UNALIGNED_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX10-NEXT:    scratch_load_ubyte v0, v6, off glc dlc
+; UNALIGNED_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX10-NEXT:    scratch_load_ubyte v0, v7, off glc dlc
+; UNALIGNED_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX10-NEXT:    scratch_load_ubyte v0, v8, off glc dlc
 ; UNALIGNED_GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; UNALIGNED_GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; UNALIGNED_GFX940-LABEL: store_load_i64_unaligned:
 ; UNALIGNED_GFX940:       ; %bb.0: ; %bb
 ; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; UNALIGNED_GFX940-NEXT:    v_mov_b64_e32 v[2:3], 15
-; UNALIGNED_GFX940-NEXT:    scratch_store_dwordx2 v0, v[2:3], off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    v_mov_b32_e32 v4, 15
+; UNALIGNED_GFX940-NEXT:    v_add_u32_e32 v1, 4, v0
+; UNALIGNED_GFX940-NEXT:    v_add_u32_e32 v2, 2, v0
+; UNALIGNED_GFX940-NEXT:    v_add_u32_e32 v3, 1, v0
+; UNALIGNED_GFX940-NEXT:    scratch_store_byte v0, v4, off sc0 sc1
 ; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
-; UNALIGNED_GFX940-NEXT:    scratch_load_dwordx2 v[0:1], v0, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    v_mov_b32_e32 v4, 0
+; UNALIGNED_GFX940-NEXT:    v_add_u32_e32 v6, 6, v0
+; UNALIGNED_GFX940-NEXT:    scratch_store_byte v3, v4, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    v_add_u32_e32 v5, 3, v0
+; UNALIGNED_GFX940-NEXT:    scratch_store_byte v2, v4, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    scratch_store_byte v5, v4, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    v_add_u32_e32 v7, 5, v0
+; UNALIGNED_GFX940-NEXT:    scratch_store_byte v1, v4, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    scratch_store_byte v7, v4, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    v_add_u32_e32 v8, 7, v0
+; UNALIGNED_GFX940-NEXT:    scratch_store_byte v6, v4, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    scratch_store_byte v8, v4, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    scratch_load_ubyte v4, v0, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    ; kill: killed $vgpr6
+; UNALIGNED_GFX940-NEXT:    ; kill: killed $vgpr1
+; UNALIGNED_GFX940-NEXT:    ; kill: killed $vgpr3
+; UNALIGNED_GFX940-NEXT:    ; kill: killed $vgpr5
+; UNALIGNED_GFX940-NEXT:    ; kill: killed $vgpr8
+; UNALIGNED_GFX940-NEXT:    ; kill: killed $vgpr7
+; UNALIGNED_GFX940-NEXT:    ; kill: killed $vgpr2
+; UNALIGNED_GFX940-NEXT:    ; kill: killed $vgpr0
+; UNALIGNED_GFX940-NEXT:    scratch_load_ubyte v4, v3, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    scratch_load_ubyte v4, v2, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    scratch_load_ubyte v4, v5, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    scratch_load_ubyte v4, v1, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    scratch_load_ubyte v4, v7, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    scratch_load_ubyte v4, v6, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    scratch_load_ubyte v4, v8, off sc0 sc1
 ; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; UNALIGNED_GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; UNALIGNED_GFX11-LABEL: store_load_i64_unaligned:
 ; UNALIGNED_GFX11:       ; %bb.0: ; %bb
 ; UNALIGNED_GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; UNALIGNED_GFX11-NEXT:    v_mov_b32_e32 v1, 15
-; UNALIGNED_GFX11-NEXT:    v_mov_b32_e32 v2, 0
-; UNALIGNED_GFX11-NEXT:    scratch_store_b64 v0, v[1:2], off dlc
+; UNALIGNED_GFX11-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
+; UNALIGNED_GFX11-NEXT:    v_add_nc_u32_e32 v4, 1, v0
+; UNALIGNED_GFX11-NEXT:    v_add_nc_u32_e32 v3, 4, v0
+; UNALIGNED_GFX11-NEXT:    v_add_nc_u32_e32 v5, 2, v0
+; UNALIGNED_GFX11-NEXT:    scratch_store_b8 v0, v1, off dlc
 ; UNALIGNED_GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; UNALIGNED_GFX11-NEXT:    scratch_load_b64 v[0:1], v0, off glc dlc
+; UNALIGNED_GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v0
+; UNALIGNED_GFX11-NEXT:    v_add_nc_u32_e32 v6, 5, v0
+; UNALIGNED_GFX11-NEXT:    v_add_nc_u32_e32 v7, 6, v0
+; UNALIGNED_GFX11-NEXT:    v_add_nc_u32_e32 v8, 7, v0
+; UNALIGNED_GFX11-NEXT:    scratch_store_b8 v4, v2, off dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX11-NEXT:    scratch_store_b8 v5, v2, off dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX11-NEXT:    scratch_store_b8 v1, v2, off dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX11-NEXT:    scratch_store_b8 v3, v2, off dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX11-NEXT:    scratch_store_b8 v6, v2, off dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX11-NEXT:    scratch_store_b8 v7, v2, off dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX11-NEXT:    scratch_store_b8 v8, v2, off dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX11-NEXT:    scratch_load_u8 v0, v0, off glc dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX11-NEXT:    scratch_load_u8 v0, v4, off glc dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX11-NEXT:    scratch_load_u8 v0, v5, off glc dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX11-NEXT:    scratch_load_u8 v0, v1, off glc dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX11-NEXT:    scratch_load_u8 v0, v3, off glc dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX11-NEXT:    scratch_load_u8 v0, v6, off glc dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX11-NEXT:    scratch_load_u8 v0, v7, off glc dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX11-NEXT:    scratch_load_u8 v0, v8, off glc dlc
 ; UNALIGNED_GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; UNALIGNED_GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2475,12 +2658,39 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) {
 ; UNALIGNED_GFX12-NEXT:    s_wait_samplecnt 0x0
 ; UNALIGNED_GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; UNALIGNED_GFX12-NEXT:    s_wait_kmcnt 0x0
-; UNALIGNED_GFX12-NEXT:    v_mov_b32_e32 v1, 15
-; UNALIGNED_GFX12-NEXT:    v_mov_b32_e32 v2, 0
+; UNALIGNED_GFX12-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
 ; UNALIGNED_GFX12-NEXT:    s_wait_storecnt 0x0
-; UNALIGNED_GFX12-NEXT:    scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    scratch_store_b8 v0, v1, off scope:SCOPE_SYS
 ; UNALIGNED_GFX12-NEXT:    s_wait_storecnt 0x0
-; UNALIGNED_GFX12-NEXT:    scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:1 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_storecnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_storecnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:3 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_storecnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:4 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_storecnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:5 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_storecnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:6 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_storecnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:7 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_storecnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_load_u8 v1, v0, off scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_loadcnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:1 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_loadcnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:2 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_loadcnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:3 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_loadcnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:4 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_loadcnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:5 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_loadcnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:6 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_loadcnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_load_u8 v0, v0, off offset:7 scope:SCOPE_SYS
 ; UNALIGNED_GFX12-NEXT:    s_wait_loadcnt 0x0
 ; UNALIGNED_GFX12-NEXT:    s_setpc_b64 s[30:31]
 bb:
@@ -2556,75 +2766,350 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s2, 3
-; GFX12-NEXT:    s_mov_b32 s1, 2
-; GFX12-NEXT:    s_mov_b32 s0, 1
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v2, s1
-; GFX12-NEXT:    v_mov_b32_e32 v1, s0
+; GFX12-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 0
+; GFX12-NEXT:    v_mov_b32_e32 v3, 2
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    scratch_store_b96 v0, v[1:3], off scope:SCOPE_SYS
+; GFX12-NEXT:    scratch_store_b8 v0, v1, off scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    scratch_load_b96 v[0:2], v0, off scope:SCOPE_SYS
+; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:1 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:3 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v1, 3
+; GFX12-NEXT:    scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:5 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:6 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:7 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_store_b8 v0, v1, off offset:8 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:9 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:10 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:11 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_load_u8 v1, v0, off scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:1 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:2 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:3 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:5 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:6 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:7 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:8 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:9 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:10 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    scratch_load_u8 v0, v0, off offset:11 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; UNALIGNED_GFX9-LABEL: store_load_v3i32_unaligned:
 ; UNALIGNED_GFX9:       ; %bb.0: ; %bb
 ; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; UNALIGNED_GFX9-NEXT:    s_mov_b32 s2, 3
-; UNALIGNED_GFX9-NEXT:    s_mov_b32 s1, 2
-; UNALIGNED_GFX9-NEXT:    s_mov_b32 s0, 1
-; UNALIGNED_GFX9-NEXT:    v_mov_b32_e32 v3, s2
-; UNALIGNED_GFX9-NEXT:    v_mov_b32_e32 v2, s1
-; UNALIGNED_GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; UNALIGNED_GFX9-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
+; UNALIGNED_GFX9-NEXT:    v_mov_b32_e32 v3, 1
+; UNALIGNED_GFX9-NEXT:    v_mov_b32_e32 v1, 2
+; UNALIGNED_GFX9-NEXT:    v_add_u32_e32 v2, 2, v0
+; UNALIGNED_GFX9-NEXT:    v_add_u32_e32 v4, 1, v0
+; UNALIGNED_GFX9-NEXT:    scratch_store_byte v0, v3, off
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; UNALIGNED_GFX9-NEXT:    v_add_u32_e32 v6, 4, v0
+; UNALIGNED_GFX9-NEXT:    v_add_u32_e32 v7, 6, v0
+; UNALIGNED_GFX9-NEXT:    v_add_u32_e32 v9, 8, v0
+; UNALIGNED_GFX9-NEXT:    v_add_u32_e32 v10, 10, v0
+; UNALIGNED_GFX9-NEXT:    v_mov_b32_e32 v12, 3
+; UNALIGNED_GFX9-NEXT:    scratch_store_byte v4, v3, off
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    v_add_u32_e32 v5, 3, v0
+; UNALIGNED_GFX9-NEXT:    scratch_store_byte v2, v3, off
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    scratch_store_byte v5, v3, off
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    v_add_u32_e32 v8, 5, v0
+; UNALIGNED_GFX9-NEXT:    scratch_store_byte v6, v1, off
 ; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
-; UNALIGNED_GFX9-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc
+; UNALIGNED_GFX9-NEXT:    scratch_store_byte v8, v3, off
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    v_add_u32_e32 v1, 7, v0
+; UNALIGNED_GFX9-NEXT:    scratch_store_byte v7, v3, off
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    scratch_store_byte v1, v3, off
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    v_add_u32_e32 v11, 9, v0
+; UNALIGNED_GFX9-NEXT:    scratch_store_byte v9, v12, off
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    scratch_store_byte v11, v3, off
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    v_add_u32_e32 v12, 11, v0
+; UNALIGNED_GFX9-NEXT:    scratch_store_byte v10, v3, off
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    scratch_store_byte v12, v3, off
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    scratch_load_ubyte v3, v0, off glc
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    ; kill: killed $vgpr5
+; UNALIGNED_GFX9-NEXT:    ; kill: killed $vgpr9
+; UNALIGNED_GFX9-NEXT:    ; kill: killed $vgpr1
+; UNALIGNED_GFX9-NEXT:    ; kill: killed $vgpr8
+; UNALIGNED_GFX9-NEXT:    ; kill: killed $vgpr2
+; UNALIGNED_GFX9-NEXT:    ; kill: killed $vgpr12
+; UNALIGNED_GFX9-NEXT:    ; kill: killed $vgpr4
+; UNALIGNED_GFX9-NEXT:    ; kill: killed $vgpr11
+; UNALIGNED_GFX9-NEXT:    ; kill: killed $vgpr7
+; UNALIGNED_GFX9-NEXT:    ; kill: killed $vgpr6
+; UNALIGNED_GFX9-NEXT:    ; kill: killed $vgpr10
+; UNALIGNED_GFX9-NEXT:    ; kill: killed $vgpr0
+; UNALIGNED_GFX9-NEXT:    scratch_load_ubyte v3, v4, off glc
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    scratch_load_ubyte v3, v2, off glc
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    scratch_load_ubyte v3, v5, off glc
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    scratch_load_ubyte v3, v6, off glc
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    scratch_load_ubyte v3, v8, off glc
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    scratch_load_ubyte v3, v7, off glc
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    scratch_load_ubyte v3, v1, off glc
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    scratch_load_ubyte v3, v9, off glc
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    scratch_load_ubyte v3, v11, off glc
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    scratch_load_ubyte v3, v10, off glc
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    scratch_load_ubyte v3, v12, off glc
 ; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; UNALIGNED_GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; UNALIGNED_GFX10-LABEL: store_load_v3i32_unaligned:
 ; UNALIGNED_GFX10:       ; %bb.0: ; %bb
 ; UNALIGNED_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; UNALIGNED_GFX10-NEXT:    s_mov_b32 s2, 3
-; UNALIGNED_GFX10-NEXT:    s_mov_b32 s1, 2
-; UNALIGNED_GFX10-NEXT:    s_mov_b32 s0, 1
-; UNALIGNED_GFX10-NEXT:    v_mov_b32_e32 v3, s2
-; UNALIGNED_GFX10-NEXT:    v_mov_b32_e32 v2, s1
-; UNALIGNED_GFX10-NEXT:    v_mov_b32_e32 v1, s0
-; UNALIGNED_GFX10-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
+; UNALIGNED_GFX10-NEXT:    v_mov_b32_e32 v1, 1
+; UNALIGNED_GFX10-NEXT:    v_mov_b32_e32 v3, 0
+; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v0
+; UNALIGNED_GFX10-NEXT:    v_mov_b32_e32 v2, 2
+; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v4, 2, v0
+; UNALIGNED_GFX10-NEXT:    scratch_store_byte v0, v1, off
+; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v1, 3, v0
+; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v6, 4, v0
+; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v7, 5, v0
+; UNALIGNED_GFX10-NEXT:    scratch_store_byte v5, v3, off
+; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX10-NEXT:    scratch_store_byte v4, v3, off
+; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v8, 6, v0
+; UNALIGNED_GFX10-NEXT:    scratch_store_byte v1, v3, off
+; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX10-NEXT:    scratch_store_byte v6, v2, off
+; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX10-NEXT:    scratch_store_byte v7, v3, off
+; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v2, 7, v0
+; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v9, 8, v0
+; UNALIGNED_GFX10-NEXT:    v_mov_b32_e32 v10, 3
+; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v11, 9, v0
+; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v12, 10, v0
+; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v13, 11, v0
+; UNALIGNED_GFX10-NEXT:    scratch_store_byte v8, v3, off
+; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX10-NEXT:    scratch_store_byte v2, v3, off
+; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX10-NEXT:    scratch_store_byte v9, v10, off
 ; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; UNALIGNED_GFX10-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc dlc
+; UNALIGNED_GFX10-NEXT:    scratch_store_byte v11, v3, off
+; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX10-NEXT:    scratch_store_byte v12, v3, off
+; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX10-NEXT:    scratch_store_byte v13, v3, off
+; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX10-NEXT:    scratch_load_ubyte v0, v0, off glc dlc
+; UNALIGNED_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX10-NEXT:    scratch_load_ubyte v0, v5, off glc dlc
+; UNALIGNED_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX10-NEXT:    scratch_load_ubyte v0, v4, off glc dlc
+; UNALIGNED_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX10-NEXT:    scratch_load_ubyte v0, v1, off glc dlc
+; UNALIGNED_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX10-NEXT:    scratch_load_ubyte v0, v6, off glc dlc
+; UNALIGNED_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX10-NEXT:    scratch_load_ubyte v0, v7, off glc dlc
+; UNALIGNED_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX10-NEXT:    scratch_load_ubyte v0, v8, off glc dlc
+; UNALIGNED_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX10-NEXT:    scratch_load_ubyte v0, v2, off glc dlc
+; UNALIGNED_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX10-NEXT:    scratch_load_ubyte v0, v9, off glc dlc
+; UNALIGNED_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX10-NEXT:    scratch_load_ubyte v0, v11, off glc dlc
+; UNALIGNED_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX10-NEXT:    scratch_load_ubyte v0, v12, off glc dlc
+; UNALIGNED_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX10-NEXT:    scratch_load_ubyte v0, v13, off glc dlc
 ; UNALIGNED_GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; UNALIGNED_GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; UNALIGNED_GFX940-LABEL: store_load_v3i32_unaligned:
 ; UNALIGNED_GFX940:       ; %bb.0: ; %bb
 ; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; UNALIGNED_GFX940-NEXT:    s_mov_b32 s2, 3
-; UNALIGNED_GFX940-NEXT:    s_mov_b32 s1, 2
-; UNALIGNED_GFX940-NEXT:    s_mov_b32 s0, 1
-; UNALIGNED_GFX940-NEXT:    v_mov_b32_e32 v4, s2
-; UNALIGNED_GFX940-NEXT:    v_mov_b32_e32 v3, s1
-; UNALIGNED_GFX940-NEXT:    v_mov_b32_e32 v2, s0
-; UNALIGNED_GFX940-NEXT:    scratch_store_dwordx3 v0, v[2:4], off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    v_mov_b32_e32 v3, 1
+; UNALIGNED_GFX940-NEXT:    v_mov_b32_e32 v1, 2
+; UNALIGNED_GFX940-NEXT:    v_add_u32_e32 v2, 2, v0
+; UNALIGNED_GFX940-NEXT:    v_add_u32_e32 v4, 1, v0
+; UNALIGNED_GFX940-NEXT:    scratch_store_byte v0, v3, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    v_mov_b32_e32 v3, 0
+; UNALIGNED_GFX940-NEXT:    v_add_u32_e32 v6, 4, v0
+; UNALIGNED_GFX940-NEXT:    v_add_u32_e32 v7, 6, v0
+; UNALIGNED_GFX940-NEXT:    v_add_u32_e32 v9, 8, v0
+; UNALIGNED_GFX940-NEXT:    v_add_u32_e32 v10, 10, v0
+; UNALIGNED_GFX940-NEXT:    v_mov_b32_e32 v12, 3
+; UNALIGNED_GFX940-NEXT:    scratch_store_byte v4, v3, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    v_add_u32_e32 v5, 3, v0
+; UNALIGNED_GFX940-NEXT:    scratch_store_byte v2, v3, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    scratch_store_byte v5, v3, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    v_add_u32_e32 v8, 5, v0
+; UNALIGNED_GFX940-NEXT:    scratch_store_byte v6, v1, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    scratch_store_byte v8, v3, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    v_add_u32_e32 v1, 7, v0
+; UNALIGNED_GFX940-NEXT:    scratch_store_byte v7, v3, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    scratch_store_byte v1, v3, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    v_add_u32_e32 v11, 9, v0
+; UNALIGNED_GFX940-NEXT:    scratch_store_byte v9, v12, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    scratch_store_byte v11, v3, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    v_add_u32_e32 v12, 11, v0
+; UNALIGNED_GFX940-NEXT:    scratch_store_byte v10, v3, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    scratch_store_byte v12, v3, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    scratch_load_ubyte v3, v0, off sc0 sc1
 ; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
-; UNALIGNED_GFX940-NEXT:    scratch_load_dwordx3 v[0:2], v0, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    ; kill: killed $vgpr5
+; UNALIGNED_GFX940-NEXT:    ; kill: killed $vgpr9
+; UNALIGNED_GFX940-NEXT:    ; kill: killed $vgpr1
+; UNALIGNED_GFX940-NEXT:    ; kill: killed $vgpr8
+; UNALIGNED_GFX940-NEXT:    ; kill: killed $vgpr2
+; UNALIGNED_GFX940-NEXT:    ; kill: killed $vgpr12
+; UNALIGNED_GFX940-NEXT:    ; kill: killed $vgpr4
+; UNALIGNED_GFX940-NEXT:    ; kill: killed $vgpr11
+; UNALIGNED_GFX940-NEXT:    ; kill: killed $vgpr7
+; UNALIGNED_GFX940-NEXT:    ; kill: killed $vgpr6
+; UNALIGNED_GFX940-NEXT:    ; kill: killed $vgpr10
+; UNALIGNED_GFX940-NEXT:    ; kill: killed $vgpr0
+; UNALIGNED_GFX940-NEXT:    scratch_load_ubyte v3, v4, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    scratch_load_ubyte v3, v2, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    scratch_load_ubyte v3, v5, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    scratch_load_ubyte v3, v6, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    scratch_load_ubyte v3, v8, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    scratch_load_ubyte v3, v7, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    scratch_load_ubyte v3, v1, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    scratch_load_ubyte v3, v9, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    scratch_load_ubyte v3, v11, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    scratch_load_ubyte v3, v10, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    scratch_load_ubyte v3, v12, off sc0 sc1
 ; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; UNALIGNED_GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; UNALIGNED_GFX11-LABEL: store_load_v3i32_unaligned:
 ; UNALIGNED_GFX11:       ; %bb.0: ; %bb
 ; UNALIGNED_GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; UNALIGNED_GFX11-NEXT:    s_mov_b32 s2, 3
-; UNALIGNED_GFX11-NEXT:    s_mov_b32 s1, 2
-; UNALIGNED_GFX11-NEXT:    s_mov_b32 s0, 1
-; UNALIGNED_GFX11-NEXT:    v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v2, s1
-; UNALIGNED_GFX11-NEXT:    v_mov_b32_e32 v1, s0
-; UNALIGNED_GFX11-NEXT:    scratch_store_b96 v0, v[1:3], off dlc
+; UNALIGNED_GFX11-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
+; UNALIGNED_GFX11-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_add_nc_u32 v4, 2, v0
+; UNALIGNED_GFX11-NEXT:    v_dual_mov_b32 v10, 3 :: v_dual_add_nc_u32 v5, 1, v0
+; UNALIGNED_GFX11-NEXT:    scratch_store_b8 v0, v1, off dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v0
+; UNALIGNED_GFX11-NEXT:    v_add_nc_u32_e32 v6, 4, v0
+; UNALIGNED_GFX11-NEXT:    v_add_nc_u32_e32 v7, 5, v0
+; UNALIGNED_GFX11-NEXT:    scratch_store_b8 v5, v3, off dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX11-NEXT:    scratch_store_b8 v4, v3, off dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX11-NEXT:    v_add_nc_u32_e32 v8, 6, v0
+; UNALIGNED_GFX11-NEXT:    scratch_store_b8 v1, v3, off dlc
 ; UNALIGNED_GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; UNALIGNED_GFX11-NEXT:    scratch_load_b96 v[0:2], v0, off glc dlc
+; UNALIGNED_GFX11-NEXT:    scratch_store_b8 v6, v2, off dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX11-NEXT:    scratch_store_b8 v7, v3, off dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX11-NEXT:    v_add_nc_u32_e32 v2, 7, v0
+; UNALIGNED_GFX11-NEXT:    v_add_nc_u32_e32 v9, 8, v0
+; UNALIGNED_GFX11-NEXT:    v_add_nc_u32_e32 v11, 9, v0
+; UNALIGNED_GFX11-NEXT:    v_add_nc_u32_e32 v12, 10, v0
+; UNALIGNED_GFX11-NEXT:    v_add_nc_u32_e32 v13, 11, v0
+; UNALIGNED_GFX11-NEXT:    scratch_store_b8 v8, v3, off dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX11-NEXT:    scratch_store_b8 v2, v3, off dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX11-NEXT:    scratch_store_b8 v9, v10, off dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX11-NEXT:    scratch_store_b8 v11, v3, off dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX11-NEXT:    scratch_store_b8 v12, v3, off dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX11-NEXT:    scratch_store_b8 v13, v3, off dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX11-NEXT:    scratch_load_u8 v0, v0, off glc dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX11-NEXT:    scratch_load_u8 v0, v5, off glc dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX11-NEXT:    scratch_load_u8 v0, v4, off glc dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX11-NEXT:    scratch_load_u8 v0, v1, off glc dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX11-NEXT:    scratch_load_u8 v0, v6, off glc dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX11-NEXT:    scratch_load_u8 v0, v7, off glc dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX11-NEXT:    scratch_load_u8 v0, v8, off glc dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX11-NEXT:    scratch_load_u8 v0, v2, off glc dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX11-NEXT:    scratch_load_u8 v0, v9, off glc dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX11-NEXT:    scratch_load_u8 v0, v11, off glc dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX11-NEXT:    scratch_load_u8 v0, v12, off glc dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX11-NEXT:    scratch_load_u8 v0, v13, off glc dlc
 ; UNALIGNED_GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; UNALIGNED_GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2635,16 +3120,57 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) {
 ; UNALIGNED_GFX12-NEXT:    s_wait_samplecnt 0x0
 ; UNALIGNED_GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; UNALIGNED_GFX12-NEXT:    s_wait_kmcnt 0x0
-; UNALIGNED_GFX12-NEXT:    s_mov_b32 s2, 3
-; UNALIGNED_GFX12-NEXT:    s_mov_b32 s1, 2
-; UNALIGNED_GFX12-NEXT:    s_mov_b32 s0, 1
-; UNALIGNED_GFX12-NEXT:    s_wait_alu 0xfffe
-; UNALIGNED_GFX12-NEXT:    v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v2, s1
-; UNALIGNED_GFX12-NEXT:    v_mov_b32_e32 v1, s0
+; UNALIGNED_GFX12-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 0
+; UNALIGNED_GFX12-NEXT:    v_mov_b32_e32 v3, 2
+; UNALIGNED_GFX12-NEXT:    s_wait_storecnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_store_b8 v0, v1, off scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_storecnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:1 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_storecnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_storecnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:3 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_storecnt 0x0
+; UNALIGNED_GFX12-NEXT:    v_mov_b32_e32 v1, 3
+; UNALIGNED_GFX12-NEXT:    scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_storecnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:5 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_storecnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:6 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_storecnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:7 scope:SCOPE_SYS
 ; UNALIGNED_GFX12-NEXT:    s_wait_storecnt 0x0
-; UNALIGNED_GFX12-NEXT:    scratch_store_b96 v0, v[1:3], off scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    scratch_store_b8 v0, v1, off offset:8 scope:SCOPE_SYS
 ; UNALIGNED_GFX12-NEXT:    s_wait_storecnt 0x0
-; UNALIGNED_GFX12-NEXT:    scratch_load_b96 v[0:2], v0, off scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:9 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_storecnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:10 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_storecnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:11 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_storecnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_load_u8 v1, v0, off scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_loadcnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:1 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_loadcnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:2 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_loadcnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:3 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_loadcnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:4 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_loadcnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:5 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_loadcnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:6 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_loadcnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:7 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_loadcnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:8 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_loadcnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:9 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_loadcnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:10 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_loadcnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_load_u8 v0, v0, off offset:11 scope:SCOPE_SYS
 ; UNALIGNED_GFX12-NEXT:    s_wait_loadcnt 0x0
 ; UNALIGNED_GFX12-NEXT:    s_setpc_b64 s[30:31]
 bb:
@@ -2725,81 +3251,456 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    s_mov_b32 s3, 4
-; GFX12-NEXT:    s_mov_b32 s2, 3
-; GFX12-NEXT:    s_mov_b32 s1, 2
-; GFX12-NEXT:    s_mov_b32 s0, 1
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2
-; GFX12-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 0
+; GFX12-NEXT:    v_mov_b32_e32 v3, 2
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_store_b8 v0, v1, off scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:1 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:3 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v1, 3
+; GFX12-NEXT:    v_mov_b32_e32 v3, 4
+; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:5 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:6 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:7 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_store_b8 v0, v1, off offset:8 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:9 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:10 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:11 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    scratch_store_b128 v0, v[1:4], off scope:SCOPE_SYS
+; GFX12-NEXT:    scratch_store_b8 v0, v3, off offset:12 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    scratch_load_b128 v[0:3], v0, off scope:SCOPE_SYS
+; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:13 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:14 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:15 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    scratch_load_u8 v1, v0, off scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:1 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:2 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:3 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:4 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:5 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:6 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:7 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:8 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:9 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:10 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:11 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:12 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:13 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:14 scope:SCOPE_SYS
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    scratch_load_u8 v0, v0, off offset:15 scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; UNALIGNED_GFX9-LABEL: store_load_v4i32_unaligned:
 ; UNALIGNED_GFX9:       ; %bb.0: ; %bb
 ; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; UNALIGNED_GFX9-NEXT:    s_mov_b32 s3, 4
-; UNALIGNED_GFX9-NEXT:    s_mov_b32 s2, 3
-; UNALIGNED_GFX9-NEXT:    s_mov_b32 s1, 2
-; UNALIGNED_GFX9-NEXT:    s_mov_b32 s0, 1
-; UNALIGNED_GFX9-NEXT:    v_mov_b32_e32 v4, s3
-; UNALIGNED_GFX9-NEXT:    v_mov_b32_e32 v3, s2
-; UNALIGNED_GFX9-NEXT:    v_mov_b32_e32 v2, s1
-; UNALIGNED_GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; UNALIGNED_GFX9-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
-; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
-; UNALIGNED_GFX9-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc
+; UNALIGNED_GFX9-NEXT:    v_mov_b32_e32 v3, 1
+; UNALIGNED_GFX9-NEXT:    v_mov_b32_e32 v1, 2
+; UNALIGNED_GFX9-NEXT:    v_add_u32_e32 v2, 2, v0
+; UNALIGNED_GFX9-NEXT:    v_add_u32_e32 v4, 1, v0
+; UNALIGNED_GFX9-NEXT:    scratch_store_byte v0, v3, off
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; UNALIGNED_GFX9-NEXT:    v_mov_b32_e32 v6, 4
+; UNALIGNED_GFX9-NEXT:    v_add_u32_e32 v7, 4, v0
+; UNALIGNED_GFX9-NEXT:    v_add_u32_e32 v8, 6, v0
+; UNALIGNED_GFX9-NEXT:    v_add_u32_e32 v10, 8, v0
+; UNALIGNED_GFX9-NEXT:    v_add_u32_e32 v11, 10, v0
+; UNALIGNED_GFX9-NEXT:    v_mov_b32_e32 v13, 3
+; UNALIGNED_GFX9-NEXT:    v_add_u32_e32 v14, 12, v0
+; UNALIGNED_GFX9-NEXT:    v_add_u32_e32 v15, 14, v0
+; UNALIGNED_GFX9-NEXT:    scratch_store_byte v4, v3, off
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    v_add_u32_e32 v5, 3, v0
+; UNALIGNED_GFX9-NEXT:    scratch_store_byte v2, v3, off
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    scratch_store_byte v5, v3, off
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    v_add_u32_e32 v9, 5, v0
+; UNALIGNED_GFX9-NEXT:    scratch_store_byte v7, v1, off
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    scratch_store_byte v9, v3, off
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    v_add_u32_e32 v1, 7, v0
+; UNALIGNED_GFX9-NEXT:    scratch_store_byte v8, v3, off
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    scratch_store_byte v1, v3, off
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    v_add_u32_e32 v12, 9, v0
+; UNALIGNED_GFX9-NEXT:    scratch_store_byte v10, v13, off
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    scratch_store_byte v12, v3, off
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    v_add_u32_e32 v13, 11, v0
+; UNALIGNED_GFX9-NEXT:    scratch_store_byte v11, v3, off
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    scratch_store_byte v13, v3, off
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    v_add_u32_e32 v16, 13, v0
+; UNALIGNED_GFX9-NEXT:    scratch_store_byte v14, v6, off
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    scratch_store_byte v16, v3, off
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    v_add_u32_e32 v6, 15, v0
+; UNALIGNED_GFX9-NEXT:    scratch_store_byte v15, v3, off
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    scratch_store_byte v6, v3, off
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    scratch_load_ubyte v3, v0, off glc
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    scratch_load_ubyte v3, v4, off glc
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    scratch_load_ubyte v3, v2, off glc
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    scratch_load_ubyte v3, v5, off glc
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    scratch_load_ubyte v3, v7, off glc
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    scratch_load_ubyte v3, v9, off glc
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    scratch_load_ubyte v3, v8, off glc
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    scratch_load_ubyte v3, v1, off glc
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    scratch_load_ubyte v3, v10, off glc
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    scratch_load_ubyte v3, v12, off glc
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    scratch_load_ubyte v3, v11, off glc
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    scratch_load_ubyte v3, v13, off glc
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    scratch_load_ubyte v3, v14, off glc
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    scratch_load_ubyte v3, v16, off glc
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    scratch_load_ubyte v3, v15, off glc
+; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX9-NEXT:    ; kill: killed $vgpr11
+; UNALIGNED_GFX9-NEXT:    ; kill: killed $vgpr4
+; UNALIGNED_GFX9-NEXT:    ; kill: killed $vgpr15
+; UNALIGNED_GFX9-NEXT:    ; kill: killed $vgpr10
+; UNALIGNED_GFX9-NEXT:    ; kill: killed $vgpr7
+; UNALIGNED_GFX9-NEXT:    ; kill: killed $vgpr13
+; UNALIGNED_GFX9-NEXT:    ; kill: killed $vgpr5
+; UNALIGNED_GFX9-NEXT:    ; kill: killed $vgpr14
+; UNALIGNED_GFX9-NEXT:    ; kill: killed $vgpr12
+; UNALIGNED_GFX9-NEXT:    ; kill: killed $vgpr8
+; UNALIGNED_GFX9-NEXT:    ; kill: killed $vgpr2
+; UNALIGNED_GFX9-NEXT:    ; kill: killed $vgpr1
+; UNALIGNED_GFX9-NEXT:    ; kill: killed $vgpr9
+; UNALIGNED_GFX9-NEXT:    ; kill: killed $vgpr16
+; UNALIGNED_GFX9-NEXT:    ; kill: killed $vgpr0
+; UNALIGNED_GFX9-NEXT:    scratch_load_ubyte v0, v6, off glc
 ; UNALIGNED_GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; UNALIGNED_GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; UNALIGNED_GFX10-LABEL: store_load_v4i32_unaligned:
 ; UNALIGNED_GFX10:       ; %bb.0: ; %bb
 ; UNALIGNED_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; UNALIGNED_GFX10-NEXT:    s_mov_b32 s3, 4
-; UNALIGNED_GFX10-NEXT:    s_mov_b32 s2, 3
-; UNALIGNED_GFX10-NEXT:    s_mov_b32 s1, 2
-; UNALIGNED_GFX10-NEXT:    s_mov_b32 s0, 1
-; UNALIGNED_GFX10-NEXT:    v_mov_b32_e32 v4, s3
-; UNALIGNED_GFX10-NEXT:    v_mov_b32_e32 v3, s2
-; UNALIGNED_GFX10-NEXT:    v_mov_b32_e32 v2, s1
-; UNALIGNED_GFX10-NEXT:    v_mov_b32_e32 v1, s0
-; UNALIGNED_GFX10-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
+; UNALIGNED_GFX10-NEXT:    v_mov_b32_e32 v1, 1
+; UNALIGNED_GFX10-NEXT:    v_mov_b32_e32 v2, 2
+; UNALIGNED_GFX10-NEXT:    v_mov_b32_e32 v3, 0
+; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
+; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v6, 4, v0
+; UNALIGNED_GFX10-NEXT:    scratch_store_byte v0, v1, off
+; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v1, 3, v0
+; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v5, 2, v0
+; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v7, 5, v0
+; UNALIGNED_GFX10-NEXT:    scratch_store_byte v4, v3, off
+; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX10-NEXT:    scratch_store_byte v5, v3, off
+; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX10-NEXT:    scratch_store_byte v1, v3, off
 ; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; UNALIGNED_GFX10-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc dlc
+; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v9, 6, v0
+; UNALIGNED_GFX10-NEXT:    scratch_store_byte v6, v2, off
+; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX10-NEXT:    scratch_store_byte v7, v3, off
+; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v2, 7, v0
+; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v10, 8, v0
+; UNALIGNED_GFX10-NEXT:    v_mov_b32_e32 v11, 3
+; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v12, 9, v0
+; UNALIGNED_GFX10-NEXT:    scratch_store_byte v9, v3, off
+; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v13, 10, v0
+; UNALIGNED_GFX10-NEXT:    scratch_store_byte v2, v3, off
+; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX10-NEXT:    scratch_store_byte v10, v11, off
+; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX10-NEXT:    scratch_store_byte v12, v3, off
+; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v11, 11, v0
+; UNALIGNED_GFX10-NEXT:    v_mov_b32_e32 v8, 4
+; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v14, 12, v0
+; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v15, 13, v0
+; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v16, 14, v0
+; UNALIGNED_GFX10-NEXT:    v_add_nc_u32_e32 v17, 15, v0
+; UNALIGNED_GFX10-NEXT:    scratch_store_byte v13, v3, off
+; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX10-NEXT:    scratch_store_byte v11, v3, off
+; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX10-NEXT:    scratch_store_byte v14, v8, off
+; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX10-NEXT:    scratch_store_byte v15, v3, off
+; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX10-NEXT:    scratch_store_byte v16, v3, off
+; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX10-NEXT:    scratch_store_byte v17, v3, off
+; UNALIGNED_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX10-NEXT:    scratch_load_ubyte v0, v0, off glc dlc
+; UNALIGNED_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX10-NEXT:    scratch_load_ubyte v0, v4, off glc dlc
+; UNALIGNED_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX10-NEXT:    scratch_load_ubyte v0, v5, off glc dlc
+; UNALIGNED_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX10-NEXT:    scratch_load_ubyte v0, v1, off glc dlc
+; UNALIGNED_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX10-NEXT:    scratch_load_ubyte v0, v6, off glc dlc
+; UNALIGNED_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX10-NEXT:    scratch_load_ubyte v0, v7, off glc dlc
+; UNALIGNED_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX10-NEXT:    scratch_load_ubyte v0, v9, off glc dlc
+; UNALIGNED_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX10-NEXT:    scratch_load_ubyte v0, v2, off glc dlc
+; UNALIGNED_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX10-NEXT:    scratch_load_ubyte v0, v10, off glc dlc
+; UNALIGNED_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX10-NEXT:    scratch_load_ubyte v0, v12, off glc dlc
+; UNALIGNED_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX10-NEXT:    scratch_load_ubyte v0, v13, off glc dlc
+; UNALIGNED_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX10-NEXT:    scratch_load_ubyte v0, v11, off glc dlc
+; UNALIGNED_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX10-NEXT:    scratch_load_ubyte v0, v14, off glc dlc
+; UNALIGNED_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX10-NEXT:    scratch_load_ubyte v0, v15, off glc dlc
+; UNALIGNED_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX10-NEXT:    scratch_load_ubyte v0, v16, off glc dlc
+; UNALIGNED_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX10-NEXT:    scratch_load_ubyte v0, v17, off glc dlc
 ; UNALIGNED_GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; UNALIGNED_GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; UNALIGNED_GFX940-LABEL: store_load_v4i32_unaligned:
 ; UNALIGNED_GFX940:       ; %bb.0: ; %bb
 ; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; UNALIGNED_GFX940-NEXT:    s_mov_b32 s3, 4
-; UNALIGNED_GFX940-NEXT:    s_mov_b32 s2, 3
-; UNALIGNED_GFX940-NEXT:    s_mov_b32 s1, 2
-; UNALIGNED_GFX940-NEXT:    s_mov_b32 s0, 1
-; UNALIGNED_GFX940-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
-; UNALIGNED_GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
-; UNALIGNED_GFX940-NEXT:    scratch_store_dwordx4 v0, v[2:5], off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    v_mov_b32_e32 v3, 1
+; UNALIGNED_GFX940-NEXT:    v_mov_b32_e32 v1, 2
+; UNALIGNED_GFX940-NEXT:    v_add_u32_e32 v2, 2, v0
+; UNALIGNED_GFX940-NEXT:    v_add_u32_e32 v4, 1, v0
+; UNALIGNED_GFX940-NEXT:    scratch_store_byte v0, v3, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    v_mov_b32_e32 v3, 0
+; UNALIGNED_GFX940-NEXT:    v_mov_b32_e32 v6, 4
+; UNALIGNED_GFX940-NEXT:    v_add_u32_e32 v7, 4, v0
+; UNALIGNED_GFX940-NEXT:    v_add_u32_e32 v8, 6, v0
+; UNALIGNED_GFX940-NEXT:    v_add_u32_e32 v10, 8, v0
+; UNALIGNED_GFX940-NEXT:    v_add_u32_e32 v11, 10, v0
+; UNALIGNED_GFX940-NEXT:    v_mov_b32_e32 v13, 3
+; UNALIGNED_GFX940-NEXT:    v_add_u32_e32 v14, 12, v0
+; UNALIGNED_GFX940-NEXT:    v_add_u32_e32 v15, 14, v0
+; UNALIGNED_GFX940-NEXT:    scratch_store_byte v4, v3, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    v_add_u32_e32 v5, 3, v0
+; UNALIGNED_GFX940-NEXT:    scratch_store_byte v2, v3, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    scratch_store_byte v5, v3, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    v_add_u32_e32 v9, 5, v0
+; UNALIGNED_GFX940-NEXT:    scratch_store_byte v7, v1, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    scratch_store_byte v9, v3, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    v_add_u32_e32 v1, 7, v0
+; UNALIGNED_GFX940-NEXT:    scratch_store_byte v8, v3, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    scratch_store_byte v1, v3, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    v_add_u32_e32 v12, 9, v0
+; UNALIGNED_GFX940-NEXT:    scratch_store_byte v10, v13, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    scratch_store_byte v12, v3, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    v_add_u32_e32 v13, 11, v0
+; UNALIGNED_GFX940-NEXT:    scratch_store_byte v11, v3, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    scratch_store_byte v13, v3, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    v_add_u32_e32 v16, 13, v0
+; UNALIGNED_GFX940-NEXT:    scratch_store_byte v14, v6, off sc0 sc1
 ; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
-; UNALIGNED_GFX940-NEXT:    scratch_load_dwordx4 v[0:3], v0, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    scratch_store_byte v16, v3, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    v_add_u32_e32 v6, 15, v0
+; UNALIGNED_GFX940-NEXT:    scratch_store_byte v15, v3, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    scratch_store_byte v6, v3, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    scratch_load_ubyte v3, v0, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    scratch_load_ubyte v3, v4, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    scratch_load_ubyte v3, v2, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    scratch_load_ubyte v3, v5, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    scratch_load_ubyte v3, v7, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    scratch_load_ubyte v3, v9, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    scratch_load_ubyte v3, v8, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    scratch_load_ubyte v3, v1, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    scratch_load_ubyte v3, v10, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    scratch_load_ubyte v3, v12, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    scratch_load_ubyte v3, v11, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    scratch_load_ubyte v3, v13, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    scratch_load_ubyte v3, v14, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    scratch_load_ubyte v3, v16, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    scratch_load_ubyte v3, v15, off sc0 sc1
+; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX940-NEXT:    ; kill: killed $vgpr11
+; UNALIGNED_GFX940-NEXT:    ; kill: killed $vgpr4
+; UNALIGNED_GFX940-NEXT:    ; kill: killed $vgpr15
+; UNALIGNED_GFX940-NEXT:    ; kill: killed $vgpr10
+; UNALIGNED_GFX940-NEXT:    ; kill: killed $vgpr7
+; UNALIGNED_GFX940-NEXT:    ; kill: killed $vgpr13
+; UNALIGNED_GFX940-NEXT:    ; kill: killed $vgpr5
+; UNALIGNED_GFX940-NEXT:    ; kill: killed $vgpr14
+; UNALIGNED_GFX940-NEXT:    ; kill: killed $vgpr12
+; UNALIGNED_GFX940-NEXT:    ; kill: killed $vgpr8
+; UNALIGNED_GFX940-NEXT:    ; kill: killed $vgpr2
+; UNALIGNED_GFX940-NEXT:    ; kill: killed $vgpr1
+; UNALIGNED_GFX940-NEXT:    ; kill: killed $vgpr9
+; UNALIGNED_GFX940-NEXT:    ; kill: killed $vgpr16
+; UNALIGNED_GFX940-NEXT:    ; kill: killed $vgpr0
+; UNALIGNED_GFX940-NEXT:    scratch_load_ubyte v0, v6, off sc0 sc1
 ; UNALIGNED_GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; UNALIGNED_GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; UNALIGNED_GFX11-LABEL: store_load_v4i32_unaligned:
 ; UNALIGNED_GFX11:       ; %bb.0: ; %bb
 ; UNALIGNED_GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; UNALIGNED_GFX11-NEXT:    s_mov_b32 s3, 4
-; UNALIGNED_GFX11-NEXT:    s_mov_b32 s2, 3
-; UNALIGNED_GFX11-NEXT:    s_mov_b32 s1, 2
-; UNALIGNED_GFX11-NEXT:    s_mov_b32 s0, 1
-; UNALIGNED_GFX11-NEXT:    v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2
-; UNALIGNED_GFX11-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
-; UNALIGNED_GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off dlc
+; UNALIGNED_GFX11-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
+; UNALIGNED_GFX11-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_add_nc_u32 v4, 1, v0
+; UNALIGNED_GFX11-NEXT:    v_dual_mov_b32 v11, 3 :: v_dual_add_nc_u32 v6, 4, v0
+; UNALIGNED_GFX11-NEXT:    scratch_store_b8 v0, v1, off dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX11-NEXT:    v_add_nc_u32_e32 v1, 3, v0
+; UNALIGNED_GFX11-NEXT:    v_dual_mov_b32 v8, 4 :: v_dual_add_nc_u32 v5, 2, v0
+; UNALIGNED_GFX11-NEXT:    v_add_nc_u32_e32 v7, 5, v0
+; UNALIGNED_GFX11-NEXT:    scratch_store_b8 v4, v3, off dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX11-NEXT:    scratch_store_b8 v5, v3, off dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX11-NEXT:    scratch_store_b8 v1, v3, off dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX11-NEXT:    v_add_nc_u32_e32 v9, 6, v0
+; UNALIGNED_GFX11-NEXT:    scratch_store_b8 v6, v2, off dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX11-NEXT:    scratch_store_b8 v7, v3, off dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX11-NEXT:    v_add_nc_u32_e32 v2, 7, v0
+; UNALIGNED_GFX11-NEXT:    v_add_nc_u32_e32 v10, 8, v0
+; UNALIGNED_GFX11-NEXT:    v_add_nc_u32_e32 v12, 9, v0
+; UNALIGNED_GFX11-NEXT:    scratch_store_b8 v9, v3, off dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX11-NEXT:    v_add_nc_u32_e32 v13, 10, v0
+; UNALIGNED_GFX11-NEXT:    scratch_store_b8 v2, v3, off dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX11-NEXT:    scratch_store_b8 v10, v11, off dlc
 ; UNALIGNED_GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; UNALIGNED_GFX11-NEXT:    scratch_load_b128 v[0:3], v0, off glc dlc
+; UNALIGNED_GFX11-NEXT:    scratch_store_b8 v12, v3, off dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX11-NEXT:    v_add_nc_u32_e32 v11, 11, v0
+; UNALIGNED_GFX11-NEXT:    v_add_nc_u32_e32 v14, 12, v0
+; UNALIGNED_GFX11-NEXT:    v_add_nc_u32_e32 v15, 13, v0
+; UNALIGNED_GFX11-NEXT:    v_add_nc_u32_e32 v16, 14, v0
+; UNALIGNED_GFX11-NEXT:    v_add_nc_u32_e32 v17, 15, v0
+; UNALIGNED_GFX11-NEXT:    scratch_store_b8 v13, v3, off dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX11-NEXT:    scratch_store_b8 v11, v3, off dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX11-NEXT:    scratch_store_b8 v14, v8, off dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX11-NEXT:    scratch_store_b8 v15, v3, off dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX11-NEXT:    scratch_store_b8 v16, v3, off dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX11-NEXT:    scratch_store_b8 v17, v3, off dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; UNALIGNED_GFX11-NEXT:    scratch_load_u8 v0, v0, off glc dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX11-NEXT:    scratch_load_u8 v0, v4, off glc dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX11-NEXT:    scratch_load_u8 v0, v5, off glc dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX11-NEXT:    scratch_load_u8 v0, v1, off glc dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX11-NEXT:    scratch_load_u8 v0, v6, off glc dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX11-NEXT:    scratch_load_u8 v0, v7, off glc dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX11-NEXT:    scratch_load_u8 v0, v9, off glc dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX11-NEXT:    scratch_load_u8 v0, v2, off glc dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX11-NEXT:    scratch_load_u8 v0, v10, off glc dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX11-NEXT:    scratch_load_u8 v0, v12, off glc dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX11-NEXT:    scratch_load_u8 v0, v13, off glc dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX11-NEXT:    scratch_load_u8 v0, v11, off glc dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX11-NEXT:    scratch_load_u8 v0, v14, off glc dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX11-NEXT:    scratch_load_u8 v0, v15, off glc dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX11-NEXT:    scratch_load_u8 v0, v16, off glc dlc
+; UNALIGNED_GFX11-NEXT:    s_waitcnt vmcnt(0)
+; UNALIGNED_GFX11-NEXT:    scratch_load_u8 v0, v17, off glc dlc
 ; UNALIGNED_GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; UNALIGNED_GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2810,17 +3711,74 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) {
 ; UNALIGNED_GFX12-NEXT:    s_wait_samplecnt 0x0
 ; UNALIGNED_GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; UNALIGNED_GFX12-NEXT:    s_wait_kmcnt 0x0
-; UNALIGNED_GFX12-NEXT:    s_mov_b32 s3, 4
-; UNALIGNED_GFX12-NEXT:    s_mov_b32 s2, 3
-; UNALIGNED_GFX12-NEXT:    s_mov_b32 s1, 2
-; UNALIGNED_GFX12-NEXT:    s_mov_b32 s0, 1
-; UNALIGNED_GFX12-NEXT:    s_wait_alu 0xfffe
-; UNALIGNED_GFX12-NEXT:    v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2
-; UNALIGNED_GFX12-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
+; UNALIGNED_GFX12-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 0
+; UNALIGNED_GFX12-NEXT:    v_mov_b32_e32 v3, 2
+; UNALIGNED_GFX12-NEXT:    s_wait_storecnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_store_b8 v0, v1, off scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_storecnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:1 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_storecnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_storecnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:3 scope:SCOPE_SYS
 ; UNALIGNED_GFX12-NEXT:    s_wait_storecnt 0x0
-; UNALIGNED_GFX12-NEXT:    scratch_store_b128 v0, v[1:4], off scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS
 ; UNALIGNED_GFX12-NEXT:    s_wait_storecnt 0x0
-; UNALIGNED_GFX12-NEXT:    scratch_load_b128 v[0:3], v0, off scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    v_mov_b32_e32 v1, 3
+; UNALIGNED_GFX12-NEXT:    v_mov_b32_e32 v3, 4
+; UNALIGNED_GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:5 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_storecnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:6 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_storecnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:7 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_storecnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_store_b8 v0, v1, off offset:8 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_storecnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:9 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_storecnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:10 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_storecnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:11 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_storecnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_store_b8 v0, v3, off offset:12 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_storecnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:13 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_storecnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:14 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_storecnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:15 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_storecnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_load_u8 v1, v0, off scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_loadcnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:1 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_loadcnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:2 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_loadcnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:3 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_loadcnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:4 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_loadcnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:5 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_loadcnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:6 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_loadcnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:7 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_loadcnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:8 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_loadcnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:9 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_loadcnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:10 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_loadcnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:11 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_loadcnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:12 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_loadcnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:13 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_loadcnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:14 scope:SCOPE_SYS
+; UNALIGNED_GFX12-NEXT:    s_wait_loadcnt 0x0
+; UNALIGNED_GFX12-NEXT:    scratch_load_u8 v0, v0, off offset:15 scope:SCOPE_SYS
 ; UNALIGNED_GFX12-NEXT:    s_wait_loadcnt 0x0
 ; UNALIGNED_GFX12-NEXT:    s_setpc_b64 s[30:31]
 bb:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir
index 9bbc29107b9df6..63e2e0ef434eec 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir
@@ -497,8 +497,14 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1)
-    ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: $vgpr0 = COPY [[OR]](s32)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s16_align1
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -517,15 +523,27 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1)
-    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0 = COPY [[OR]](s32)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_s16_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR]](s32)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s16) = G_LOAD %0 :: (load (s16), align 1, addrspace 0)
     %2:_(s32) = G_ANYEXT %1
@@ -648,8 +666,14 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2)
-    ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: $vgpr0 = COPY [[OR]](s32)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s32_align2
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -668,15 +692,27 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2)
-    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0 = COPY [[OR]](s32)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_s32_align2
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR]](s32)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s32) = G_LOAD %0 :: (load (s32), align 2, addrspace 0)
     $vgpr0 = COPY %1
@@ -752,8 +788,24 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1)
-    ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; GFX12-NEXT: $vgpr0 = COPY [[OR2]](s32)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s32_align1
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -782,15 +834,47 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1)
-    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0 = COPY [[OR2]](s32)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_s32_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR2]](s32)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s32) = G_LOAD %0 :: (load (s32), align 1, addrspace 0)
     $vgpr0 = COPY %1
@@ -1121,8 +1205,26 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 2)
-    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32)
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
+    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32)
+    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]]
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[OR2]](s64)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s64_align2
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -1153,15 +1255,51 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 2)
-    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]]
+    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[OR2]](s64)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_s64_align2
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 2)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
+    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; UNALIGNED_GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]]
+    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[OR2]](s64)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s64) = G_LOAD %0 :: (load (s64), align 2, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -1269,8 +1407,44 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 1)
-    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
+    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
+    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
+    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
+    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
+    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
+    ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
+    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[OR6]](s64)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s64_align1
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -1319,15 +1493,87 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 1)
-    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
+    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[OR6]](s64)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_s64_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 1)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
+    ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; UNALIGNED_GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
+    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[OR6]](s64)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s64) = G_LOAD %0 :: (load (s64), align 1, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -1681,8 +1927,29 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 2)
-    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
+    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8)
+    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10)
+    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
+    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32)
+    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
     ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s96_align2
@@ -1718,16 +1985,58 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 2)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
     ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_s96_align2
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 2)
-    ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
+    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10)
+    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
     ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s96) = G_LOAD %0 :: (load (s96), align 2, addrspace 0)
@@ -1870,8 +2179,55 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 1)
-    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
+    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
+    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
+    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
+    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
+    ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
+    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
+    ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
+    ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
+    ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
+    ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
+    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32)
+    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
     ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s96_align1
@@ -1933,16 +2289,110 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 1)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
     ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_s96_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 1)
-    ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
+    ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
+    ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
+    ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
+    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
     ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s96) = G_LOAD %0 :: (load (s96), align 1, addrspace 0)
@@ -2600,8 +3050,70 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 1)
-    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>)
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
+    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
+    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
+    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
+    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
+    ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
+    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
+    ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
+    ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
+    ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
+    ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
+    ; GFX12-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12
+    ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
+    ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
+    ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
+    ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
+    ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
+    ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
+    ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
+    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
+    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
     ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s128_align1
@@ -2678,16 +3190,140 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 1)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
     ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_s128_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 1)
-    ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
+    ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
+    ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
+    ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
+    ; UNALIGNED_GFX12-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
+    ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
+    ; UNALIGNED_GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
+    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
     ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(s128) = G_LOAD %0 :: (load (s128), align 1, addrspace 0)
@@ -3086,8 +3722,45 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p0) :: (load (p1), align 1)
-    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
+    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
+    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
+    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
+    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
+    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
+    ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
+    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
+    ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR6]](s64)
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_p1_align1
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -3137,15 +3810,89 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p0) :: (load (p1), align 1)
-    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR6]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_p1_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p0) :: (load (p1), align 1)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
+    ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; UNALIGNED_GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
+    ; UNALIGNED_GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR6]](s64)
+    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(p1) = G_LOAD %0 :: (load (p1), align 1, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -3431,8 +4178,27 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 2)
-    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4)
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32)
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
+    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32)
+    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]]
+    ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR2]](s64)
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_p4_align2
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -3464,15 +4230,53 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 2)
-    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_p4_align2
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 2)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
+    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; UNALIGNED_GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]]
+    ; UNALIGNED_GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR2]](s64)
+    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(p4) = G_LOAD %0 :: (load (p4), align 2, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -3580,8 +4384,45 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 1)
-    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4)
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
+    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
+    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
+    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
+    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
+    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
+    ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
+    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
+    ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR6]](s64)
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_p4_align1
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -3631,15 +4472,89 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 1)
-    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR6]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_p4_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 1)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
+    ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; UNALIGNED_GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
+    ; UNALIGNED_GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR6]](s64)
+    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(p4) = G_LOAD %0 :: (load (p4), align 1, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -3763,8 +4678,15 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5), align 2)
-    ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p5)
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32)
+    ; GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_p5_align2
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -3784,15 +4706,29 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5), align 2)
-    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](p5)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_p5_align2
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5), align 2)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p5)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32)
+    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(p5) = G_LOAD %0 :: (load (p5), align 2, addrspace 0)
     $vgpr0 = COPY %1
@@ -3870,8 +4806,25 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5), align 1)
-    ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p5)
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32)
+    ; GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_p5_align1
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -3901,15 +4854,49 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5), align 1)
-    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](p5)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_p5_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5), align 1)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p5)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32)
+    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(p5) = G_LOAD %0 :: (load (p5), align 1, addrspace 0)
     $vgpr0 = COPY %1
@@ -4101,8 +5088,14 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1)
-    ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: $vgpr0 = COPY [[OR]](s32)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2s8_align1
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -4121,15 +5114,27 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1)
-    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0 = COPY [[OR]](s32)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_v2s8_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR]](s32)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<2 x s8>) = G_LOAD %0 :: (load (<2 x s8>), align 1, addrspace 0)
     %2:_(s16) = G_BITCAST %1
@@ -4553,36 +5558,41 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16), align 1)
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
     ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
     ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C2]](s32)
-    ; GFX12-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32)
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]]
+    ; GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32)
+    ; GFX12-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32)
     ; GFX12-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-    ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
-    ; GFX12-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]]
+    ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
+    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
+    ; GFX12-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]]
     ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
-    ; GFX12-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]]
-    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL1]]
+    ; GFX12-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]]
+    ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
+    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16)
+    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]]
     ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
-    ; GFX12-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]]
+    ; GFX12-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]]
     ; GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32)
-    ; GFX12-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]]
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]]
-    ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16)
-    ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16)
-    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]]
-    ; GFX12-NEXT: $vgpr0 = COPY [[OR3]](s32)
+    ; GFX12-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]]
+    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16)
+    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL3]]
+    ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16)
+    ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16)
+    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]]
+    ; GFX12-NEXT: $vgpr0 = COPY [[OR4]](s32)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v3s8_align1
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -4628,71 +5638,81 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16), align 1)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
     ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 2)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
     ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C2]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32)
     ; UNALIGNED_GFX11PLUS-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-    ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
-    ; UNALIGNED_GFX11PLUS-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
+    ; UNALIGNED_GFX11PLUS-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]]
     ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL1]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]]
     ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]]
     ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]]
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]]
-    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0 = COPY [[OR3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL3]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]]
+    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0 = COPY [[OR4]](s32)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_v3s8_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16), align 1)
-    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
     ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 2)
-    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
     ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; UNALIGNED_GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C2]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]]
+    ; UNALIGNED_GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32)
     ; UNALIGNED_GFX12-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-    ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
-    ; UNALIGNED_GFX12-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]]
+    ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
+    ; UNALIGNED_GFX12-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]]
     ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]]
-    ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
-    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16)
-    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL1]]
+    ; UNALIGNED_GFX12-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]]
+    ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
+    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16)
+    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]]
     ; UNALIGNED_GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]]
+    ; UNALIGNED_GFX12-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]]
     ; UNALIGNED_GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]]
-    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16)
-    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]]
-    ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16)
-    ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]]
-    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16)
+    ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL3]]
+    ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16)
+    ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]]
+    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR4]](s32)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<3 x s8>) = G_LOAD %0 :: (load (<3 x s8>), align 1, addrspace 0)
     %2:_(s24) = G_BITCAST %1
@@ -4817,8 +5837,14 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2)
-    ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: $vgpr0 = COPY [[OR]](s32)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v4s8_align2
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -4837,15 +5863,27 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2)
-    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0 = COPY [[OR]](s32)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_v4s8_align2
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR]](s32)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<4 x s8>) = G_LOAD %0 :: (load (<4 x s8>), align 2, addrspace 0)
     %2:_(s32) = G_BITCAST %1
@@ -4922,8 +5960,24 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1)
-    ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; GFX12-NEXT: $vgpr0 = COPY [[OR2]](s32)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v4s8_align1
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -4952,15 +6006,47 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1)
-    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0 = COPY [[OR2]](s32)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_v4s8_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR2]](s32)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<4 x s8>) = G_LOAD %0 :: (load (<4 x s8>), align 1, addrspace 0)
     %2:_(s32) = G_BITCAST %1
@@ -5389,8 +6475,14 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>), align 2)
-    ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16))
+    ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
+    ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
+    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+    ; GFX12-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2s16_align2
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -5409,15 +6501,27 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>), align 2)
-    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16))
+    ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_v2s16_align2
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>), align 2)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16))
+    ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
+    ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<2 x s16>) = G_LOAD %0 :: (load (<2 x s16>), align 2, addrspace 0)
     $vgpr0 = COPY %1
@@ -5501,8 +6605,24 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>), align 1)
-    ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>)
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
+    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+    ; GFX12-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2s16_align1
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -5531,15 +6651,47 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>), align 1)
-    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_v2s16_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>), align 1)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<2 x s16>) = G_LOAD %0 :: (load (<2 x s16>), align 1, addrspace 0)
     $vgpr0 = COPY %1
@@ -6429,22 +7581,36 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1)
-    ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
     ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2, align 1)
-    ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64)
-    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4, align 1)
-    ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
+    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
+    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
+    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
+    ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32)
     ; GFX12-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
     ; GFX12-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
     ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
     ; GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32)
+    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32)
     ; GFX12-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
     ; GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
     ; GFX12-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
@@ -6501,22 +7667,36 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
     ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2, align 1)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4, align 1)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32)
     ; UNALIGNED_GFX11PLUS-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
     ; UNALIGNED_GFX11PLUS-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
     ; UNALIGNED_GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
     ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32)
     ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
     ; UNALIGNED_GFX11PLUS-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
     ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
@@ -6530,22 +7710,36 @@ body: |
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1)
-    ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
     ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2, align 1)
-    ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4, align 1)
-    ; UNALIGNED_GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32)
     ; UNALIGNED_GFX12-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
     ; UNALIGNED_GFX12-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
     ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
     ; UNALIGNED_GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32)
     ; UNALIGNED_GFX12-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
     ; UNALIGNED_GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
     ; UNALIGNED_GFX12-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
@@ -6789,8 +7983,24 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 2)
-    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16))
+    ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
+    ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64)
+    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
+    ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6
+    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
+    ; GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32)
+    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+    ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+    ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v4s16_align2
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -6819,15 +8029,47 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 2)
-    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16))
+    ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
+    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_v4s16_align2
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 2)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16))
+    ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
+    ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
+    ; UNALIGNED_GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
+    ; UNALIGNED_GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+    ; UNALIGNED_GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
+    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<4 x s16>) = G_LOAD %0 :: (load (<4 x s16>), align 2, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -6949,8 +8191,42 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 1)
-    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>)
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
+    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
+    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
+    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
+    ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32)
+    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 6
+    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
+    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
+    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32)
+    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+    ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+    ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v4s16_align1
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -6997,15 +8273,83 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 1)
-    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 6
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
+    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_v4s16_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 1)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 6
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
+    ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+    ; UNALIGNED_GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
+    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<4 x s16>) = G_LOAD %0 :: (load (<4 x s16>), align 1, addrspace 0)
     $vgpr0_vgpr1 = COPY %1
@@ -8460,8 +9804,45 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 2)
-    ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>)
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32)
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
+    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32)
+    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]]
+    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8)
+    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10)
+    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD2]]
+    ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR3]](s32)
+    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s16) from unknown-address + 12)
+    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s16) from unknown-address + 14)
+    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD3]]
+    ; GFX12-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR4]](s32)
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C3]](s32)
+    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32)
+    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s64) = G_OR [[SHL5]], [[ZEXT1]]
+    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR2]](s64), [[OR5]](s64)
+    ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2s64_align2
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -8511,15 +9892,89 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 2)
-    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s16) from unknown-address + 12)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s16) from unknown-address + 14)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR4]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s64) = G_OR [[SHL5]], [[ZEXT1]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR2]](s64), [[OR5]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_v2s64_align2
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 2)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
+    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; UNALIGNED_GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]]
+    ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10)
+    ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s16) from unknown-address + 12)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s16) from unknown-address + 14)
+    ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX12-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR4]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s64) = G_OR [[SHL5]], [[ZEXT1]]
+    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR2]](s64), [[OR5]](s64)
+    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<2 x s64>) = G_LOAD %0 :: (load (<2 x s64>), align 2, addrspace 0)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -8689,8 +10144,79 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1)
-    ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>)
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
+    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
+    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
+    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
+    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
+    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
+    ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
+    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
+    ; GFX12-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
+    ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
+    ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]]
+    ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
+    ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
+    ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]]
+    ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]]
+    ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32)
+    ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
+    ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
+    ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]]
+    ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
+    ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
+    ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]]
+    ; GFX12-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]]
+    ; GFX12-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32)
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
+    ; GFX12-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32)
+    ; GFX12-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]]
+    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64)
+    ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2s64_align1
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -8774,15 +10300,157 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1)
-    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_v2s64_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
+    ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; UNALIGNED_GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
+    ; UNALIGNED_GFX12-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
+    ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
+    ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]]
+    ; UNALIGNED_GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
+    ; UNALIGNED_GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
+    ; UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]]
+    ; UNALIGNED_GFX12-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]]
+    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64)
+    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<2 x s64>) = G_LOAD %0 :: (load (<2 x s64>), align 1, addrspace 0)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -9326,14 +10994,114 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1)
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
     ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p0) :: (load (s64) from unknown-address + 16, align 1)
-    ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[LOAD]](<2 x s64>)
+    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
+    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
+    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
+    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
+    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
+    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
+    ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
+    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
+    ; GFX12-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
+    ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
+    ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]]
+    ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
+    ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
+    ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]]
+    ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]]
+    ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32)
+    ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
+    ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
+    ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]]
+    ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
+    ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
+    ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]]
+    ; GFX12-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]]
+    ; GFX12-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32)
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
+    ; GFX12-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32)
+    ; GFX12-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]]
+    ; GFX12-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; GFX12-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16)
+    ; GFX12-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17)
+    ; GFX12-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]]
+    ; GFX12-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18)
+    ; GFX12-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19)
+    ; GFX12-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]]
+    ; GFX12-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[OR14]]
+    ; GFX12-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32)
+    ; GFX12-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20)
+    ; GFX12-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21)
+    ; GFX12-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[ZEXTLOAD15]]
+    ; GFX12-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22)
+    ; GFX12-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23)
+    ; GFX12-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]]
+    ; GFX12-NEXT: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]]
+    ; GFX12-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32)
+    ; GFX12-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
+    ; GFX12-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32)
+    ; GFX12-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]]
     ; GFX12-NEXT: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF
-    ; GFX12-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64), [[UV4:%[0-9]+]]:_(s64), [[UV5:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>)
-    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[UV]](s64), [[UV1]](s64), [[LOAD1]](s64), [[UV5]](s64)
+    ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64), [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>)
+    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64), [[OR20]](s64), [[UV3]](s64)
     ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v3s64_align1
@@ -9454,28 +11222,228 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
     ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p0) :: (load (s64) from unknown-address + 16, align 1)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[LOAD]](<2 x s64>)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[OR14]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[ZEXTLOAD15]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]]
     ; UNALIGNED_GFX11PLUS-NEXT: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF
-    ; UNALIGNED_GFX11PLUS-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64), [[UV4:%[0-9]+]]:_(s64), [[UV5:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[UV]](s64), [[UV1]](s64), [[LOAD1]](s64), [[UV5]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64), [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64), [[OR20]](s64), [[UV3]](s64)
     ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_v3s64_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1)
-    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
     ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p0) :: (load (s64) from unknown-address + 16, align 1)
-    ; UNALIGNED_GFX12-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[LOAD]](<2 x s64>)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
+    ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; UNALIGNED_GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
+    ; UNALIGNED_GFX12-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
+    ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
+    ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]]
+    ; UNALIGNED_GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
+    ; UNALIGNED_GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
+    ; UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]]
+    ; UNALIGNED_GFX12-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]]
+    ; UNALIGNED_GFX12-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17)
+    ; UNALIGNED_GFX12-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19)
+    ; UNALIGNED_GFX12-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[OR14]]
+    ; UNALIGNED_GFX12-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21)
+    ; UNALIGNED_GFX12-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[ZEXTLOAD15]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23)
+    ; UNALIGNED_GFX12-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]]
+    ; UNALIGNED_GFX12-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]]
     ; UNALIGNED_GFX12-NEXT: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF
-    ; UNALIGNED_GFX12-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64), [[UV4:%[0-9]+]]:_(s64), [[UV5:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>)
-    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[UV]](s64), [[UV1]](s64), [[LOAD1]](s64), [[UV5]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64), [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>)
+    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64), [[OR20]](s64), [[UV3]](s64)
     ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<3 x s64>) = G_LOAD %0 :: (load (<3 x s64>), align 1, addrspace 0)
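(Aside, not part of the patch: the generated checks above encode the legalizer's byte-wise expansion of an align-1 flat load into G_ZEXTLOAD / G_SHL / G_OR chains, with two 32-bit halves merged into each s64 lane. A minimal C++ sketch of that recombination, purely illustrative and assuming little-endian byte order as in the MIR; the function names are hypothetical:)

#include <cstdint>

// Reassemble a 32-bit value from four single-byte loads, mirroring the
// G_ZEXTLOAD / G_SHL 8 / G_OR / G_SHL 16 / G_OR chain in the checks.
static uint32_t load32_bytewise(const uint8_t *p) {
  uint32_t lo = p[0] | (uint32_t(p[1]) << 8);   // bytes 0-1
  uint32_t hi = p[2] | (uint32_t(p[3]) << 8);   // bytes 2-3
  return lo | (hi << 16);
}

// The two 32-bit halves are then widened and merged into a 64-bit lane,
// mirroring the G_ZEXT / G_ANYEXT / G_SHL 32 / G_OR steps above.
static uint64_t load64_bytewise(const uint8_t *p) {
  uint64_t lo = load32_bytewise(p);
  uint64_t hi = load32_bytewise(p + 4);
  return lo | (hi << 32);
}

(End of aside; the patch continues below.)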
@@ -10048,11 +12016,147 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1)
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
     ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[PTR_ADD]](p0) :: (load (<2 x s64>) from unknown-address + 16, align 1)
-    ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>)
+    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
+    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
+    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
+    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
+    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
+    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
+    ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
+    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
+    ; GFX12-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
+    ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
+    ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]]
+    ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
+    ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
+    ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]]
+    ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]]
+    ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32)
+    ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
+    ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
+    ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]]
+    ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
+    ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
+    ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]]
+    ; GFX12-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]]
+    ; GFX12-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32)
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
+    ; GFX12-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32)
+    ; GFX12-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]]
+    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64)
+    ; GFX12-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; GFX12-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16)
+    ; GFX12-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17)
+    ; GFX12-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]]
+    ; GFX12-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18)
+    ; GFX12-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19)
+    ; GFX12-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]]
+    ; GFX12-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[OR14]]
+    ; GFX12-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32)
+    ; GFX12-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20)
+    ; GFX12-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21)
+    ; GFX12-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[ZEXTLOAD15]]
+    ; GFX12-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22)
+    ; GFX12-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23)
+    ; GFX12-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]]
+    ; GFX12-NEXT: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]]
+    ; GFX12-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32)
+    ; GFX12-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
+    ; GFX12-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32)
+    ; GFX12-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]]
+    ; GFX12-NEXT: [[PTR_ADD23:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C6]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD18:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD23]](p0) :: (load (s8) from unknown-address + 24)
+    ; GFX12-NEXT: [[PTR_ADD24:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD19:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD24]](p0) :: (load (s8) from unknown-address + 25)
+    ; GFX12-NEXT: [[SHL21:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD19]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR21:%[0-9]+]]:_(s32) = G_OR [[SHL21]], [[ZEXTLOAD18]]
+    ; GFX12-NEXT: [[PTR_ADD25:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C2]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD20:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD25]](p0) :: (load (s8) from unknown-address + 26)
+    ; GFX12-NEXT: [[PTR_ADD26:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD25]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD26]](p0) :: (load (s8) from unknown-address + 27)
+    ; GFX12-NEXT: [[SHL22:%[0-9]+]]:_(s32) = G_SHL [[LOAD6]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR22:%[0-9]+]]:_(s32) = G_OR [[SHL22]], [[ZEXTLOAD20]]
+    ; GFX12-NEXT: [[SHL23:%[0-9]+]]:_(s32) = G_SHL [[OR22]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR23:%[0-9]+]]:_(s32) = G_OR [[SHL23]], [[OR21]]
+    ; GFX12-NEXT: [[ZEXT3:%[0-9]+]]:_(s64) = G_ZEXT [[OR23]](s32)
+    ; GFX12-NEXT: [[PTR_ADD27:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C4]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD21:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD27]](p0) :: (load (s8) from unknown-address + 28)
+    ; GFX12-NEXT: [[PTR_ADD28:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD22:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD28]](p0) :: (load (s8) from unknown-address + 29)
+    ; GFX12-NEXT: [[SHL24:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD22]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR24:%[0-9]+]]:_(s32) = G_OR [[SHL24]], [[ZEXTLOAD21]]
+    ; GFX12-NEXT: [[PTR_ADD29:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C2]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD23:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD29]](p0) :: (load (s8) from unknown-address + 30)
+    ; GFX12-NEXT: [[PTR_ADD30:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD29]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD30]](p0) :: (load (s8) from unknown-address + 31)
+    ; GFX12-NEXT: [[SHL25:%[0-9]+]]:_(s32) = G_SHL [[LOAD7]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR25:%[0-9]+]]:_(s32) = G_OR [[SHL25]], [[ZEXTLOAD23]]
+    ; GFX12-NEXT: [[SHL26:%[0-9]+]]:_(s32) = G_SHL [[OR25]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR26:%[0-9]+]]:_(s32) = G_OR [[SHL26]], [[OR24]]
+    ; GFX12-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[OR26]](s32)
+    ; GFX12-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
+    ; GFX12-NEXT: [[SHL27:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT3]], [[COPY3]](s32)
+    ; GFX12-NEXT: [[OR27:%[0-9]+]]:_(s64) = G_OR [[SHL27]], [[ZEXT3]]
+    ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR20]](s64), [[OR27]](s64)
+    ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s64>), [[BUILD_VECTOR1]](<2 x s64>)
     ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<4 x s64>)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v4s64_align1
@@ -10206,22 +12310,294 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
     ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[PTR_ADD]](p0) :: (load (<2 x s64>) from unknown-address + 16, align 1)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[OR14]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[ZEXTLOAD15]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD23:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C6]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD18:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD23]](p0) :: (load (s8) from unknown-address + 24)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD24:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD19:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD24]](p0) :: (load (s8) from unknown-address + 25)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL21:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD19]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR21:%[0-9]+]]:_(s32) = G_OR [[SHL21]], [[ZEXTLOAD18]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD25:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD20:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD25]](p0) :: (load (s8) from unknown-address + 26)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD26:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD25]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD26]](p0) :: (load (s8) from unknown-address + 27)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL22:%[0-9]+]]:_(s32) = G_SHL [[LOAD6]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR22:%[0-9]+]]:_(s32) = G_OR [[SHL22]], [[ZEXTLOAD20]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL23:%[0-9]+]]:_(s32) = G_SHL [[OR22]], [[C3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR23:%[0-9]+]]:_(s32) = G_OR [[SHL23]], [[OR21]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXT3:%[0-9]+]]:_(s64) = G_ZEXT [[OR23]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD27:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C4]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD21:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD27]](p0) :: (load (s8) from unknown-address + 28)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD28:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD22:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD28]](p0) :: (load (s8) from unknown-address + 29)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL24:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD22]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR24:%[0-9]+]]:_(s32) = G_OR [[SHL24]], [[ZEXTLOAD21]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD29:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD23:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD29]](p0) :: (load (s8) from unknown-address + 30)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD30:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD29]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD30]](p0) :: (load (s8) from unknown-address + 31)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL25:%[0-9]+]]:_(s32) = G_SHL [[LOAD7]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR25:%[0-9]+]]:_(s32) = G_OR [[SHL25]], [[ZEXTLOAD23]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL26:%[0-9]+]]:_(s32) = G_SHL [[OR25]], [[C3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR26:%[0-9]+]]:_(s32) = G_OR [[SHL26]], [[OR24]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[OR26]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL27:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT3]], [[COPY3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR27:%[0-9]+]]:_(s64) = G_OR [[SHL27]], [[ZEXT3]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR20]](s64), [[OR27]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s64>), [[BUILD_VECTOR1]](<2 x s64>)
     ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<4 x s64>)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_v4s64_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1)
-    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
     ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[PTR_ADD]](p0) :: (load (<2 x s64>) from unknown-address + 16, align 1)
-    ; UNALIGNED_GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
+    ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; UNALIGNED_GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
+    ; UNALIGNED_GFX12-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
+    ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
+    ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]]
+    ; UNALIGNED_GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
+    ; UNALIGNED_GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
+    ; UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]]
+    ; UNALIGNED_GFX12-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]]
+    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17)
+    ; UNALIGNED_GFX12-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19)
+    ; UNALIGNED_GFX12-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[OR14]]
+    ; UNALIGNED_GFX12-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21)
+    ; UNALIGNED_GFX12-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[ZEXTLOAD15]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23)
+    ; UNALIGNED_GFX12-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]]
+    ; UNALIGNED_GFX12-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD23:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C6]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD18:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD23]](p0) :: (load (s8) from unknown-address + 24)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD24:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD19:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD24]](p0) :: (load (s8) from unknown-address + 25)
+    ; UNALIGNED_GFX12-NEXT: [[SHL21:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD19]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR21:%[0-9]+]]:_(s32) = G_OR [[SHL21]], [[ZEXTLOAD18]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD25:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD20:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD25]](p0) :: (load (s8) from unknown-address + 26)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD26:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD25]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD26]](p0) :: (load (s8) from unknown-address + 27)
+    ; UNALIGNED_GFX12-NEXT: [[SHL22:%[0-9]+]]:_(s32) = G_SHL [[LOAD6]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR22:%[0-9]+]]:_(s32) = G_OR [[SHL22]], [[ZEXTLOAD20]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL23:%[0-9]+]]:_(s32) = G_SHL [[OR22]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR23:%[0-9]+]]:_(s32) = G_OR [[SHL23]], [[OR21]]
+    ; UNALIGNED_GFX12-NEXT: [[ZEXT3:%[0-9]+]]:_(s64) = G_ZEXT [[OR23]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD27:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C4]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD21:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD27]](p0) :: (load (s8) from unknown-address + 28)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD28:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD22:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD28]](p0) :: (load (s8) from unknown-address + 29)
+    ; UNALIGNED_GFX12-NEXT: [[SHL24:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD22]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR24:%[0-9]+]]:_(s32) = G_OR [[SHL24]], [[ZEXTLOAD21]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD29:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD23:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD29]](p0) :: (load (s8) from unknown-address + 30)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD30:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD29]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD30]](p0) :: (load (s8) from unknown-address + 31)
+    ; UNALIGNED_GFX12-NEXT: [[SHL25:%[0-9]+]]:_(s32) = G_SHL [[LOAD7]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR25:%[0-9]+]]:_(s32) = G_OR [[SHL25]], [[ZEXTLOAD23]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL26:%[0-9]+]]:_(s32) = G_SHL [[OR25]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR26:%[0-9]+]]:_(s32) = G_OR [[SHL26]], [[OR24]]
+    ; UNALIGNED_GFX12-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[OR26]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[SHL27:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT3]], [[COPY3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR27:%[0-9]+]]:_(s64) = G_OR [[SHL27]], [[ZEXT3]]
+    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR20]](s64), [[OR27]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s64>), [[BUILD_VECTOR1]](<2 x s64>)
     ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<4 x s64>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<4 x s64>) = G_LOAD %0 :: (load (<4 x s64>), align 1, addrspace 0)
@@ -10818,8 +13194,70 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 1)
-    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>)
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
+    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
+    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
+    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
+    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
+    ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
+    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
+    ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
+    ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
+    ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
+    ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
+    ; GFX12-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12
+    ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
+    ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
+    ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
+    ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
+    ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
+    ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
+    ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
+    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
+    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
     ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2p1_align1
@@ -10896,16 +13334,140 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 1)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
     ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_v2p1_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 1)
-    ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
+    ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
+    ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
+    ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
+    ; UNALIGNED_GFX12-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
+    ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
+    ; UNALIGNED_GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
+    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
     ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<2 x p1>) = G_LOAD %0 :: (load (<2 x p1>), align 1, addrspace 0)
@@ -11182,8 +13744,40 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>), align 1)
-    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>)
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
+    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
+    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
+    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
+    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32)
+    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>)
     ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2p3_align1
@@ -11230,16 +13824,80 @@ body: |
     ; UNALIGNED_GFX11PLUS: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX11PLUS-NEXT: {{  $}}
     ; UNALIGNED_GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>), align 1)
-    ; UNALIGNED_GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; UNALIGNED_GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32)
+    ; UNALIGNED_GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>)
     ; UNALIGNED_GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_flat_v2p3_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0_vgpr1
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>), align 1)
-    ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
+    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
+    ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>)
     ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>)
     %0:_(p0) = COPY $vgpr0_vgpr1
     %1:_(<2 x p3>) = G_LOAD %0 :: (load (<2 x p3>), align 1, addrspace 0)
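
For readers tracing the expanded UNALIGNED_* checks above: each 32-bit value is reassembled from single-byte loads in little-endian order through the G_ZEXTLOAD/G_SHL/G_OR chains. A minimal host-side sketch of that arithmetic, with an illustrative helper name that is not part of the patch:

  #include <cstdint>

  // Little-endian reassembly of one 32-bit lane from four byte loads,
  // mirroring OR2 = (((b3 << 8) | b2) << 16) | ((b1 << 8) | b0) in the checks.
  static uint32_t combineBytesLE(const uint8_t *p) {
    uint32_t lo = (uint32_t)p[0] | ((uint32_t)p[1] << 8);  // OR
    uint32_t hi = (uint32_t)p[2] | ((uint32_t)p[3] << 8);  // OR1
    return (hi << 16) | lo;                                // OR2
  }

The most significant byte of each 32-bit piece is fetched with a plain (any-extending) G_LOAD rather than G_ZEXTLOAD, presumably because its undefined high bits are shifted out by the subsequent 16-bit shift in the combine.
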
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir
index 6ea0515924bf67..e8df205e697ef5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir
@@ -657,8 +657,14 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5)
-    ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: $vgpr0 = COPY [[OR]](s32)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_s16_align1
     ; UNALIGNED_GFX9: liveins: $vgpr0
@@ -690,15 +696,27 @@ body: |
     ; UNALIGNED_GFX11: liveins: $vgpr0
     ; UNALIGNED_GFX11-NEXT: {{  $}}
     ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5)
-    ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[OR]](s32)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_private_s16_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR]](s32)
     %0:_(p5) = COPY $vgpr0
     %1:_(s16) = G_LOAD %0 :: (load (s16), align 1, addrspace 5)
     %2:_(s32) = G_ANYEXT %1
@@ -862,8 +880,14 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5)
-    ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: $vgpr0 = COPY [[OR]](s32)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_s32_align2
     ; UNALIGNED_GFX9: liveins: $vgpr0
@@ -895,15 +919,27 @@ body: |
     ; UNALIGNED_GFX11: liveins: $vgpr0
     ; UNALIGNED_GFX11-NEXT: {{  $}}
     ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5)
-    ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[OR]](s32)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_private_s32_align2
     ; UNALIGNED_GFX12: liveins: $vgpr0
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR]](s32)
     %0:_(p5) = COPY $vgpr0
     %1:_(s32) = G_LOAD %0 :: (load (s32), align 2, addrspace 5)
     $vgpr0 = COPY %1
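
The align-2 expansion directly above combines two naturally aligned 16-bit loads instead of four bytes. A hedged sketch of the equivalent arithmetic (helper name illustrative, not from the patch):

  #include <cstdint>

  // Mirrors OR = (LOAD(s16 at +2) << 16) | ZEXTLOAD(s16) from the align-2 checks.
  static uint32_t combineHalvesLE(const uint16_t *p) {
    return (uint32_t)p[0] | ((uint32_t)p[1] << 16);
  }

The uint16_t accesses in the sketch rely on the same 2-byte alignment that the test's `align 2` operand guarantees.
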
@@ -1009,8 +1045,24 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5)
-    ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; GFX12-NEXT: $vgpr0 = COPY [[OR2]](s32)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_s32_align1
     ; UNALIGNED_GFX9: liveins: $vgpr0
@@ -1062,15 +1114,47 @@ body: |
     ; UNALIGNED_GFX11: liveins: $vgpr0
     ; UNALIGNED_GFX11-NEXT: {{  $}}
     ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5)
-    ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[OR2]](s32)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_private_s32_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR2]](s32)
     %0:_(p5) = COPY $vgpr0
     %1:_(s32) = G_LOAD %0 :: (load (s32), align 1, addrspace 5)
     $vgpr0 = COPY %1
@@ -1512,14 +1596,20 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5)
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
     ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
     ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: $vgpr0 = COPY [[OR]](s32)
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]]
+    ; GFX12-NEXT: $vgpr0 = COPY [[OR1]](s32)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_s24_align1
     ; UNALIGNED_GFX9: liveins: $vgpr0
@@ -1563,27 +1653,39 @@ body: |
     ; UNALIGNED_GFX11: liveins: $vgpr0
     ; UNALIGNED_GFX11-NEXT: {{  $}}
     ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5)
-    ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
     ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
     ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[OR]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]]
+    ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[OR1]](s32)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_private_s24_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5)
-    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
     ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
     ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]]
+    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR1]](s32)
     %0:_(p5) = COPY $vgpr0
     %1:_(s24) = G_LOAD %0 :: (load (s24), align 1, addrspace 5)
     %2:_(s32) = G_ANYEXT %1
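
In the s24 case above, three byte loads are merged into the low 24 bits of a 32-bit register, with the top byte left undefined to match the any-extended s24 result. A small sketch of that merge (name illustrative):

  #include <cstdint>

  // OR1 = (byte2 << 16) | ((byte1 << 8) | byte0), as in the s24 align-1 checks.
  static uint32_t combineS24LE(const uint8_t *p) {
    uint32_t lo16 = (uint32_t)p[0] | ((uint32_t)p[1] << 8);
    return ((uint32_t)p[2] << 16) | lo16;
  }
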
@@ -2108,8 +2210,26 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p5) :: (load (s64), align 2, addrspace 5)
-    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32)
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5)
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32)
+    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]]
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[OR2]](s64)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_s64_align2
     ; UNALIGNED_GFX9: liveins: $vgpr0
@@ -2157,15 +2277,51 @@ body: |
     ; UNALIGNED_GFX11: liveins: $vgpr0
     ; UNALIGNED_GFX11-NEXT: {{  $}}
     ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p5) :: (load (s64), align 2, addrspace 5)
-    ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; UNALIGNED_GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]]
+    ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[OR2]](s64)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_private_s64_align2
     ; UNALIGNED_GFX12: liveins: $vgpr0
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p5) :: (load (s64), align 2, addrspace 5)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; UNALIGNED_GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]]
+    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[OR2]](s64)
     %0:_(p5) = COPY $vgpr0
     %1:_(s64) = G_LOAD %0 :: (load (s64), align 2, addrspace 5)
     $vgpr0_vgpr1 = COPY %1
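
For the 64-bit results above, the two 32-bit halves are widened and merged with a shift by 32; in this align-2 case each half is itself assembled from two halfword loads as in the earlier s32 expansion. A minimal sketch of the final merge (name illustrative, not from the patch):

  #include <cstdint>

  // OR2 = (ANYEXT(hi) << 32) | ZEXT(lo), as in the s64 checks above.
  static uint64_t combineWordsLE(uint32_t lo, uint32_t hi) {
    return ((uint64_t)hi << 32) | (uint64_t)lo;
  }
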
@@ -2327,8 +2483,44 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p5) :: (load (s64), align 1, addrspace 5)
-    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
+    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
+    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
+    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
+    ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
+    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[OR6]](s64)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_s64_align1
     ; UNALIGNED_GFX9: liveins: $vgpr0
@@ -2412,15 +2604,87 @@ body: |
     ; UNALIGNED_GFX11: liveins: $vgpr0
     ; UNALIGNED_GFX11-NEXT: {{  $}}
     ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p5) :: (load (s64), align 1, addrspace 5)
-    ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX11-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; UNALIGNED_GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
+    ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[OR6]](s64)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_private_s64_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p5) :: (load (s64), align 1, addrspace 5)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; UNALIGNED_GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
+    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[OR6]](s64)
     %0:_(p5) = COPY $vgpr0
     %1:_(s64) = G_LOAD %0 :: (load (s64), align 1, addrspace 5)
     $vgpr0_vgpr1 = COPY %1
@@ -2636,8 +2900,54 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5)
-    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
+    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
+    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
+    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
+    ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
+    ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
+    ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
+    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32)
+    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
     ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_s96_align16
@@ -2752,16 +3062,108 @@ body: |
     ; UNALIGNED_GFX11: liveins: $vgpr0
     ; UNALIGNED_GFX11-NEXT: {{  $}}
     ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5)
-    ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
+    ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
+    ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
     ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_private_s96_align16
     ; UNALIGNED_GFX12: liveins: $vgpr0
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5)
-    ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
+    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
     ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     %0:_(p5) = COPY $vgpr0
     %1:_(s96) = G_LOAD %0 :: (load (s96), align 1, addrspace 5)
@@ -3197,8 +3599,29 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 2, addrspace 5)
-    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5)
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5)
+    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
+    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32)
+    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
     ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_s96_align2
@@ -3263,16 +3686,58 @@ body: |
     ; UNALIGNED_GFX11: liveins: $vgpr0
     ; UNALIGNED_GFX11-NEXT: {{  $}}
     ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 2, addrspace 5)
-    ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
     ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_private_s96_align2
     ; UNALIGNED_GFX12: liveins: $vgpr0
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 2, addrspace 5)
-    ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
     ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     %0:_(p5) = COPY $vgpr0
     %1:_(s96) = G_LOAD %0 :: (load (s96), align 2, addrspace 5)
@@ -3489,8 +3954,54 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5)
-    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
+    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
+    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
+    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
+    ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
+    ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
+    ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
+    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32)
+    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
     ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_s96_align1
@@ -3605,16 +4116,108 @@ body: |
     ; UNALIGNED_GFX11: liveins: $vgpr0
     ; UNALIGNED_GFX11-NEXT: {{  $}}
     ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5)
-    ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
+    ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
+    ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
     ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_private_s96_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5)
-    ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
+    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
     ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     %0:_(p5) = COPY $vgpr0
     %1:_(s96) = G_LOAD %0 :: (load (s96), align 1, addrspace 5)
@@ -3882,8 +4485,69 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5)
-    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>)
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
+    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
+    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
+    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
+    ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
+    ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
+    ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
+    ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5)
+    ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
+    ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5)
+    ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
+    ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
+    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
+    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
     ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_s128_align16
@@ -4028,16 +4692,138 @@ body: |
     ; UNALIGNED_GFX11: liveins: $vgpr0
     ; UNALIGNED_GFX11-NEXT: {{  $}}
     ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5)
-    ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
+    ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
+    ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
+    ; UNALIGNED_GFX11-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
+    ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
     ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_private_s128_align16
     ; UNALIGNED_GFX12: liveins: $vgpr0
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5)
-    ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
+    ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
+    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
     ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128)
     %0:_(p5) = COPY $vgpr0
     %1:_(s128) = G_LOAD %0 :: (load (s128), align 1, addrspace 5)
@@ -4542,8 +5328,36 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 2, addrspace 5)
-    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>)
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5)
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5)
+    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
+    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5)
+    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32)
+    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
     ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_s128_align2
@@ -4622,16 +5436,72 @@ body: |
     ; UNALIGNED_GFX11: liveins: $vgpr0
     ; UNALIGNED_GFX11-NEXT: {{  $}}
     ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 2, addrspace 5)
-    ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
     ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_private_s128_align2
     ; UNALIGNED_GFX12: liveins: $vgpr0
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 2, addrspace 5)
-    ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
     ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128)
     %0:_(p5) = COPY $vgpr0
     %1:_(s128) = G_LOAD %0 :: (load (s128), align 2, addrspace 5)
@@ -4899,8 +5769,69 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5)
-    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>)
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
+    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
+    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
+    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
+    ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
+    ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
+    ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
+    ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5)
+    ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
+    ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5)
+    ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
+    ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
+    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
+    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
     ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_s128_align1
@@ -5045,16 +5976,138 @@ body: |
     ; UNALIGNED_GFX11: liveins: $vgpr0
     ; UNALIGNED_GFX11-NEXT: {{  $}}
     ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5)
-    ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
+    ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
+    ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
+    ; UNALIGNED_GFX11-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
+    ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
     ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_private_s128_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5)
-    ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
+    ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
+    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
     ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128)
     %0:_(p5) = COPY $vgpr0
     %1:_(s128) = G_LOAD %0 :: (load (s128), align 1, addrspace 5)
@@ -5393,8 +6446,27 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p5) :: (load (p1), align 2, addrspace 5)
-    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32)
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5)
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32)
+    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]]
+    ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR2]](s64)
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_p1_align2
     ; UNALIGNED_GFX9: liveins: $vgpr0
@@ -5442,15 +6514,53 @@ body: |
     ; UNALIGNED_GFX11: liveins: $vgpr0
     ; UNALIGNED_GFX11-NEXT: {{  $}}
     ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p5) :: (load (p1), align 2, addrspace 5)
-    ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; UNALIGNED_GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]]
+    ; UNALIGNED_GFX11-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR2]](s64)
+    ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_private_p1_align2
     ; UNALIGNED_GFX12: liveins: $vgpr0
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p5) :: (load (p1), align 2, addrspace 5)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; UNALIGNED_GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]]
+    ; UNALIGNED_GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR2]](s64)
+    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1)
     %0:_(p5) = COPY $vgpr0
     %1:_(p1) = G_LOAD %0 :: (load (p1), align 2, addrspace 5)
     $vgpr0_vgpr1 = COPY %1
@@ -5612,8 +6722,45 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p5) :: (load (p1), align 1, addrspace 5)
-    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
+    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
+    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
+    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
+    ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
+    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
+    ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR6]](s64)
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_p1_align1
     ; UNALIGNED_GFX9: liveins: $vgpr0
@@ -5697,15 +6844,89 @@ body: |
     ; UNALIGNED_GFX11: liveins: $vgpr0
     ; UNALIGNED_GFX11-NEXT: {{  $}}
     ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p5) :: (load (p1), align 1, addrspace 5)
-    ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX11-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; UNALIGNED_GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
+    ; UNALIGNED_GFX11-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR6]](s64)
+    ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_private_p1_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p5) :: (load (p1), align 1, addrspace 5)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; UNALIGNED_GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
+    ; UNALIGNED_GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR6]](s64)
+    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1)
     %0:_(p5) = COPY $vgpr0
     %1:_(p1) = G_LOAD %0 :: (load (p1), align 1, addrspace 5)
     $vgpr0_vgpr1 = COPY %1
@@ -5871,8 +7092,15 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 2, addrspace 5)
-    ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p3)
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR]](s32)
+    ; GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_p3_align2
     ; UNALIGNED_GFX9: liveins: $vgpr0
@@ -5906,15 +7134,29 @@ body: |
     ; UNALIGNED_GFX11: liveins: $vgpr0
     ; UNALIGNED_GFX11-NEXT: {{  $}}
     ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 2, addrspace 5)
-    ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](p3)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR]](s32)
+    ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_private_p3_align2
     ; UNALIGNED_GFX12: liveins: $vgpr0
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 2, addrspace 5)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p3)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR]](s32)
+    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3)
     %0:_(p5) = COPY $vgpr0
     %1:_(p3) = G_LOAD %0 :: (load (p3), align 2, addrspace 5)
     $vgpr0 = COPY %1
@@ -6023,8 +7265,25 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 1, addrspace 5)
-    ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p3)
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32)
+    ; GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_p3_align1
     ; UNALIGNED_GFX9: liveins: $vgpr0
@@ -6078,15 +7337,49 @@ body: |
     ; UNALIGNED_GFX11: liveins: $vgpr0
     ; UNALIGNED_GFX11-NEXT: {{  $}}
     ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 1, addrspace 5)
-    ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](p3)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX11-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32)
+    ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_private_p3_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 1, addrspace 5)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p3)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32)
+    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3)
     %0:_(p5) = COPY $vgpr0
     %1:_(p3) = G_LOAD %0 :: (load (p3), align 1, addrspace 5)
     $vgpr0 = COPY %1
@@ -6252,8 +7545,15 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 2, addrspace 5)
-    ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p5)
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32)
+    ; GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_p5_align2
     ; UNALIGNED_GFX9: liveins: $vgpr0
@@ -6287,15 +7587,29 @@ body: |
     ; UNALIGNED_GFX11: liveins: $vgpr0
     ; UNALIGNED_GFX11-NEXT: {{  $}}
     ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 2, addrspace 5)
-    ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](p5)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32)
+    ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_private_p5_align2
     ; UNALIGNED_GFX12: liveins: $vgpr0
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 2, addrspace 5)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p5)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32)
+    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5)
     %0:_(p5) = COPY $vgpr0
     %1:_(p5) = G_LOAD %0 :: (load (p5), align 2, addrspace 5)
     $vgpr0 = COPY %1
@@ -6404,8 +7718,25 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 1, addrspace 5)
-    ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p5)
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32)
+    ; GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_p5_align1
     ; UNALIGNED_GFX9: liveins: $vgpr0
@@ -6459,15 +7790,49 @@ body: |
     ; UNALIGNED_GFX11: liveins: $vgpr0
     ; UNALIGNED_GFX11-NEXT: {{  $}}
     ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 1, addrspace 5)
-    ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](p5)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX11-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32)
+    ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_private_p5_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 1, addrspace 5)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p5)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32)
+    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5)
     %0:_(p5) = COPY $vgpr0
     %1:_(p5) = G_LOAD %0 :: (load (p5), align 1, addrspace 5)
     $vgpr0 = COPY %1
@@ -6647,10 +8012,15 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5)
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C]](s32)
-    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LSHR]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32)
+    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[LSHR]](s32)
     ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_v2s8_align1
@@ -6687,20 +8057,30 @@ body: |
     ; UNALIGNED_GFX11: liveins: $vgpr0
     ; UNALIGNED_GFX11-NEXT: {{  $}}
     ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5)
-    ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; UNALIGNED_GFX11-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C]](s32)
-    ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LSHR]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[LSHR]](s32)
     ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_private_v2s8_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5)
-    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; UNALIGNED_GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LSHR]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[LSHR]](s32)
     ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
     %0:_(p5) = COPY $vgpr0
     %1:_(<2 x s8>) = G_LOAD %0 :: (load (<2 x s8>), align 1, addrspace 5)
@@ -7293,36 +8673,41 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5)
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
     ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
     ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C2]](s32)
-    ; GFX12-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32)
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]]
+    ; GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32)
+    ; GFX12-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32)
     ; GFX12-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-    ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
-    ; GFX12-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]]
+    ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
+    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
+    ; GFX12-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]]
     ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
-    ; GFX12-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]]
-    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL1]]
+    ; GFX12-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]]
+    ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
+    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16)
+    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]]
     ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
-    ; GFX12-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]]
+    ; GFX12-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]]
     ; GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32)
-    ; GFX12-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]]
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]]
-    ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16)
-    ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16)
-    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]]
-    ; GFX12-NEXT: $vgpr0 = COPY [[OR3]](s32)
+    ; GFX12-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]]
+    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16)
+    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL3]]
+    ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16)
+    ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16)
+    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]]
+    ; GFX12-NEXT: $vgpr0 = COPY [[OR4]](s32)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_v3s8_align1
     ; UNALIGNED_GFX9: liveins: $vgpr0
@@ -7408,71 +8793,81 @@ body: |
     ; UNALIGNED_GFX11: liveins: $vgpr0
     ; UNALIGNED_GFX11-NEXT: {{  $}}
     ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5)
-    ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
     ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
     ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; UNALIGNED_GFX11-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C2]](s32)
-    ; UNALIGNED_GFX11-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]]
+    ; UNALIGNED_GFX11-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32)
     ; UNALIGNED_GFX11-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-    ; UNALIGNED_GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
-    ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
-    ; UNALIGNED_GFX11-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]]
+    ; UNALIGNED_GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
+    ; UNALIGNED_GFX11-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]]
     ; UNALIGNED_GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
-    ; UNALIGNED_GFX11-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]]
-    ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
-    ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16)
-    ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL1]]
+    ; UNALIGNED_GFX11-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]]
+    ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
+    ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16)
+    ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]]
     ; UNALIGNED_GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
-    ; UNALIGNED_GFX11-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]]
+    ; UNALIGNED_GFX11-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]]
     ; UNALIGNED_GFX11-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32)
-    ; UNALIGNED_GFX11-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]]
-    ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16)
-    ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]]
-    ; UNALIGNED_GFX11-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16)
-    ; UNALIGNED_GFX11-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16)
-    ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32)
-    ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]]
-    ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[OR3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]]
+    ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16)
+    ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL3]]
+    ; UNALIGNED_GFX11-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16)
+    ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]]
+    ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[OR4]](s32)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_private_v3s8_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5)
-    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
     ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
     ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; UNALIGNED_GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C2]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]]
+    ; UNALIGNED_GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32)
     ; UNALIGNED_GFX12-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-    ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
-    ; UNALIGNED_GFX12-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]]
+    ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
+    ; UNALIGNED_GFX12-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]]
     ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]]
-    ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
-    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16)
-    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL1]]
+    ; UNALIGNED_GFX12-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]]
+    ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
+    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16)
+    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]]
     ; UNALIGNED_GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]]
+    ; UNALIGNED_GFX12-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]]
     ; UNALIGNED_GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]]
-    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16)
-    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]]
-    ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16)
-    ; UNALIGNED_GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16)
-    ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]]
-    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16)
+    ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL3]]
+    ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16)
+    ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]]
+    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR4]](s32)
     %0:_(p5) = COPY $vgpr0
     %1:_(<3 x s8>) = G_LOAD %0 :: (load (<3 x s8>), align 1, addrspace 5)
     %2:_(s24) = G_BITCAST %1
@@ -7939,8 +9334,69 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5)
-    ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>)
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
+    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
+    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
+    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
+    ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
+    ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
+    ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
+    ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5)
+    ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
+    ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5)
+    ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
+    ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
+    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
+    ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_v16s8_align16
     ; UNALIGNED_GFX9: liveins: $vgpr0
@@ -8082,15 +9538,137 @@ body: |
     ; UNALIGNED_GFX11: liveins: $vgpr0
     ; UNALIGNED_GFX11-NEXT: {{  $}}
     ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5)
-    ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
+    ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
+    ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
+    ; UNALIGNED_GFX11-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
+    ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
+    ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_private_v16s8_align16
     ; UNALIGNED_GFX12: liveins: $vgpr0
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
+    ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
+    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
+    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>)
     %0:_(p5) = COPY $vgpr0
     %1:_(<16 x s8>) = G_LOAD %0 :: (load (<16 x s8>), align 1, addrspace 5)
     %2:_(<4 x s32>) = G_BITCAST %1
@@ -8266,8 +9844,14 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 2, addrspace 5)
-    ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5)
+    ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
+    ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
+    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+    ; GFX12-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_v2s16_align2
     ; UNALIGNED_GFX9: liveins: $vgpr0
@@ -8299,15 +9883,27 @@ body: |
     ; UNALIGNED_GFX11: liveins: $vgpr0
     ; UNALIGNED_GFX11-NEXT: {{  $}}
     ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 2, addrspace 5)
-    ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+    ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_private_v2s16_align2
     ; UNALIGNED_GFX12: liveins: $vgpr0
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 2, addrspace 5)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
     %0:_(p5) = COPY $vgpr0
     %1:_(<2 x s16>) = G_LOAD %0 :: (load (<2 x s16>), align 2, addrspace 5)
     $vgpr0 = COPY %1
@@ -8425,8 +10021,24 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 1, addrspace 5)
-    ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>)
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
+    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+    ; GFX12-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_v2s16_align1
     ; UNALIGNED_GFX9: liveins: $vgpr0
@@ -8478,15 +10090,47 @@ body: |
     ; UNALIGNED_GFX11: liveins: $vgpr0
     ; UNALIGNED_GFX11-NEXT: {{  $}}
     ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 1, addrspace 5)
-    ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; UNALIGNED_GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+    ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_private_v2s16_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 1, addrspace 5)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+    ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
     %0:_(p5) = COPY $vgpr0
     %1:_(<2 x s16>) = G_LOAD %0 :: (load (<2 x s16>), align 1, addrspace 5)
     $vgpr0 = COPY %1
@@ -9392,22 +11036,36 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5)
-    ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
     ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, align 1, addrspace 5)
-    ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
-    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, align 1, addrspace 5)
-    ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
+    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
+    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
+    ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32)
     ; GFX12-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
     ; GFX12-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
     ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
     ; GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32)
+    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32)
     ; GFX12-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
     ; GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
     ; GFX12-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
@@ -9507,22 +11165,36 @@ body: |
     ; UNALIGNED_GFX11: liveins: $vgpr0
     ; UNALIGNED_GFX11-NEXT: {{  $}}
     ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5)
-    ; UNALIGNED_GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
-    ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
     ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, align 1, addrspace 5)
-    ; UNALIGNED_GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
-    ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
-    ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, align 1, addrspace 5)
-    ; UNALIGNED_GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; UNALIGNED_GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32)
     ; UNALIGNED_GFX11-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
     ; UNALIGNED_GFX11-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
     ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
     ; UNALIGNED_GFX11-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
-    ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX11-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32)
     ; UNALIGNED_GFX11-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
     ; UNALIGNED_GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
     ; UNALIGNED_GFX11-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
@@ -9536,22 +11208,36 @@ body: |
     ; UNALIGNED_GFX12: liveins: $vgpr0
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5)
-    ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
     ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, align 1, addrspace 5)
-    ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, align 1, addrspace 5)
-    ; UNALIGNED_GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32)
     ; UNALIGNED_GFX12-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
     ; UNALIGNED_GFX12-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
     ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
     ; UNALIGNED_GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; UNALIGNED_GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32)
     ; UNALIGNED_GFX12-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
     ; UNALIGNED_GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
     ; UNALIGNED_GFX12-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
@@ -9918,8 +11604,24 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p5) :: (load (<4 x s16>), align 2, addrspace 5)
-    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5)
+    ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
+    ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
+    ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6
+    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5)
+    ; GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32)
+    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+    ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+    ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_v4s16_align2
     ; UNALIGNED_GFX9: liveins: $vgpr0
@@ -9969,15 +11671,47 @@ body: |
     ; UNALIGNED_GFX11: liveins: $vgpr0
     ; UNALIGNED_GFX11-NEXT: {{  $}}
     ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p5) :: (load (<4 x s16>), align 2, addrspace 5)
-    ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+    ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+    ; UNALIGNED_GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
+    ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_private_v4s16_align2
     ; UNALIGNED_GFX12: liveins: $vgpr0
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p5) :: (load (<4 x s16>), align 2, addrspace 5)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+    ; UNALIGNED_GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
+    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
     %0:_(p5) = COPY $vgpr0
     %1:_(<4 x s16>) = G_LOAD %0 :: (load (<4 x s16>), align 2, addrspace 5)
     $vgpr0_vgpr1 = COPY %1
@@ -10160,8 +11894,42 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p5) :: (load (<4 x s16>), align 1, addrspace 5)
-    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>)
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
+    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
+    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
+    ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32)
+    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 6
+    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
+    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32)
+    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+    ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+    ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_v4s16_align1
     ; UNALIGNED_GFX9: liveins: $vgpr0
@@ -10247,15 +12015,83 @@ body: |
     ; UNALIGNED_GFX11: liveins: $vgpr0
     ; UNALIGNED_GFX11-NEXT: {{  $}}
     ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p5) :: (load (<4 x s16>), align 1, addrspace 5)
-    ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; UNALIGNED_GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 6
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX11-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+    ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+    ; UNALIGNED_GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
+    ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_private_v4s16_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p5) :: (load (<4 x s16>), align 1, addrspace 5)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 6
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
+    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
+    ; UNALIGNED_GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
+    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
     %0:_(p5) = COPY $vgpr0
     %1:_(<4 x s16>) = G_LOAD %0 :: (load (<4 x s16>), align 1, addrspace 5)
     $vgpr0_vgpr1 = COPY %1
@@ -10593,8 +12429,22 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p5) :: (load (<2 x s32>), align 2, addrspace 5)
-    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>)
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5)
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32)
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_v2s32_align2
     ; UNALIGNED_GFX9: liveins: $vgpr0
@@ -10642,15 +12492,43 @@ body: |
     ; UNALIGNED_GFX11: liveins: $vgpr0
     ; UNALIGNED_GFX11-NEXT: {{  $}}
     ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p5) :: (load (<2 x s32>), align 2, addrspace 5)
-    ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32)
+    ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_private_v2s32_align2
     ; UNALIGNED_GFX12: liveins: $vgpr0
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p5) :: (load (<2 x s32>), align 2, addrspace 5)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32)
+    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
     %0:_(p5) = COPY $vgpr0
     %1:_(<2 x s32>) = G_LOAD %0 :: (load (<2 x s32>), align 2, addrspace 5)
     $vgpr0_vgpr1 = COPY %1
@@ -10812,8 +12690,40 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p5) :: (load (<2 x s32>), align 1, addrspace 5)
-    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>)
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
+    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
+    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32)
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_v2s32_align1
     ; UNALIGNED_GFX9: liveins: $vgpr0
@@ -10897,15 +12807,79 @@ body: |
     ; UNALIGNED_GFX11: liveins: $vgpr0
     ; UNALIGNED_GFX11-NEXT: {{  $}}
     ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p5) :: (load (<2 x s32>), align 1, addrspace 5)
-    ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32)
+    ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_private_v2s32_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p5) :: (load (<2 x s32>), align 1, addrspace 5)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32)
+    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
     %0:_(p5) = COPY $vgpr0
     %1:_(<2 x s32>) = G_LOAD %0 :: (load (<2 x s32>), align 1, addrspace 5)
     $vgpr0_vgpr1 = COPY %1
@@ -11115,8 +13089,54 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5)
-    ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](<3 x s32>)
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
+    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
+    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
+    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
+    ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
+    ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
+    ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
+    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32)
+    ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_v3s32_align16
     ; UNALIGNED_GFX9: liveins: $vgpr0
@@ -11228,15 +13248,107 @@ body: |
     ; UNALIGNED_GFX11: liveins: $vgpr0
     ; UNALIGNED_GFX11-NEXT: {{  $}}
     ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5)
-    ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](<3 x s32>)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
+    ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
+    ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32)
+    ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_private_v3s32_align16
     ; UNALIGNED_GFX12: liveins: $vgpr0
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](<3 x s32>)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
+    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32)
+    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
     %0:_(p5) = COPY $vgpr0
     %1:_(<3 x s32>) = G_LOAD %0 :: (load (<3 x s32>), align 1, addrspace 5)
     $vgpr0_vgpr1_vgpr2 = COPY %1
@@ -11633,8 +13745,69 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5)
-    ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>)
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
+    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
+    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
+    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
+    ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
+    ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
+    ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
+    ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5)
+    ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
+    ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5)
+    ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
+    ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
+    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
+    ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_v4s32_align16
     ; UNALIGNED_GFX9: liveins: $vgpr0
@@ -11776,15 +13949,137 @@ body: |
     ; UNALIGNED_GFX11: liveins: $vgpr0
     ; UNALIGNED_GFX11-NEXT: {{  $}}
     ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5)
-    ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
+    ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
+    ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
+    ; UNALIGNED_GFX11-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
+    ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
+    ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_private_v4s32_align16
     ; UNALIGNED_GFX12: liveins: $vgpr0
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
+    ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
+    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
+    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>)
     %0:_(p5) = COPY $vgpr0
     %1:_(<4 x s32>) = G_LOAD %0 :: (load (<4 x s32>), align 1, addrspace 5)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -12260,8 +14555,36 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 2, addrspace 5)
-    ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>)
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5)
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5)
+    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
+    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5)
+    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32)
+    ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_v4s32_align2
     ; UNALIGNED_GFX9: liveins: $vgpr0
@@ -12337,15 +14660,71 @@ body: |
     ; UNALIGNED_GFX11: liveins: $vgpr0
     ; UNALIGNED_GFX11-NEXT: {{  $}}
     ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 2, addrspace 5)
-    ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32)
+    ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_private_v4s32_align2
     ; UNALIGNED_GFX12: liveins: $vgpr0
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 2, addrspace 5)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32)
+    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>)
     %0:_(p5) = COPY $vgpr0
     %1:_(<4 x s32>) = G_LOAD %0 :: (load (<4 x s32>), align 2, addrspace 5)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -12606,8 +14985,69 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5)
-    ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>)
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
+    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
+    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
+    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
+    ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
+    ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
+    ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
+    ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5)
+    ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
+    ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5)
+    ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
+    ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
+    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
+    ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_v4s32_align1
     ; UNALIGNED_GFX9: liveins: $vgpr0
@@ -12749,15 +15189,137 @@ body: |
     ; UNALIGNED_GFX11: liveins: $vgpr0
     ; UNALIGNED_GFX11-NEXT: {{  $}}
     ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5)
-    ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
+    ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
+    ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
+    ; UNALIGNED_GFX11-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
+    ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
+    ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_private_v4s32_align1
     ; UNALIGNED_GFX12: liveins: $vgpr0
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
+    ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
+    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
+    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>)
     %0:_(p5) = COPY $vgpr0
     %1:_(<4 x s32>) = G_LOAD %0 :: (load (<4 x s32>), align 1, addrspace 5)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -13893,8 +16455,78 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p5) :: (load (<2 x s64>), align 1, addrspace 5)
-    ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>)
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
+    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
+    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
+    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
+    ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
+    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
+    ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
+    ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]]
+    ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
+    ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]]
+    ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]]
+    ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32)
+    ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5)
+    ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]]
+    ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5)
+    ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]]
+    ; GFX12-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]]
+    ; GFX12-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32)
+    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
+    ; GFX12-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32)
+    ; GFX12-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]]
+    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64)
+    ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_v2s64_align16
     ; UNALIGNED_GFX9: liveins: $vgpr0
@@ -14038,15 +16670,155 @@ body: |
     ; UNALIGNED_GFX11: liveins: $vgpr0
     ; UNALIGNED_GFX11-NEXT: {{  $}}
     ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p5) :: (load (<2 x s64>), align 1, addrspace 5)
-    ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX11-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; UNALIGNED_GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]]
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]]
+    ; UNALIGNED_GFX11-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]]
+    ; UNALIGNED_GFX11-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]]
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]]
+    ; UNALIGNED_GFX11-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]]
+    ; UNALIGNED_GFX11-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]]
+    ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64)
+    ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
     ;
     ; UNALIGNED_GFX12-LABEL: name: test_load_private_v2s64_align16
     ; UNALIGNED_GFX12: liveins: $vgpr0
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p5) :: (load (<2 x s64>), align 1, addrspace 5)
-    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; UNALIGNED_GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
+    ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]]
+    ; UNALIGNED_GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]]
+    ; UNALIGNED_GFX12-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]]
+    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64)
+    ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
     %0:_(p5) = COPY $vgpr0
     %1:_(<2 x s64>) = G_LOAD %0 :: (load (<2 x s64>), align 1, addrspace 5)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -16744,12 +19516,99 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5)
-    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
     ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD]](p5) :: (load (<3 x s32>) from unknown-address + 12, align 1, addrspace 5)
-    ; GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD1]](<3 x s32>)
+    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
+    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
+    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
+    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
+    ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
+    ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
+    ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
+    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32)
+    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
+    ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5)
+    ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
+    ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5)
+    ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
+    ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
+    ; GFX12-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C4]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p5) :: (load (s8) from unknown-address + 16, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p5) :: (load (s8) from unknown-address + 17, addrspace 5)
+    ; GFX12-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[ZEXTLOAD12]]
+    ; GFX12-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p5) :: (load (s8) from unknown-address + 18, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD17]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p5) :: (load (s8) from unknown-address + 19, addrspace 5)
+    ; GFX12-NEXT: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR13:%[0-9]+]]:_(s32) = G_OR [[SHL13]], [[ZEXTLOAD14]]
+    ; GFX12-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[OR13]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[OR12]]
+    ; GFX12-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C1]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p5) :: (load (s8) from unknown-address + 20, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], [[C]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p5) :: (load (s8) from unknown-address + 21, addrspace 5)
+    ; GFX12-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD15]]
+    ; GFX12-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p5) :: (load (s8) from unknown-address + 22, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD21]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p5) :: (load (s8) from unknown-address + 23, addrspace 5)
+    ; GFX12-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[ZEXTLOAD17]]
+    ; GFX12-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[OR16]], [[C3]](s32)
+    ; GFX12-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[OR15]]
+    ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR11]](s32), [[OR14]](s32), [[OR17]](s32)
+    ; GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>)
     ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96)
     ; GFX12-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96)
     ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96)
@@ -16963,12 +19822,99 @@ body: |
     ; UNALIGNED_GFX11: liveins: $vgpr0
     ; UNALIGNED_GFX11-NEXT: {{  $}}
     ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5)
-    ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
-    ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
     ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD]](p5) :: (load (<3 x s32>) from unknown-address + 12, align 1, addrspace 5)
-    ; UNALIGNED_GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD1]](<3 x s32>)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
+    ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
+    ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
+    ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
+    ; UNALIGNED_GFX11-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C4]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p5) :: (load (s8) from unknown-address + 16, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p5) :: (load (s8) from unknown-address + 17, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[ZEXTLOAD12]]
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p5) :: (load (s8) from unknown-address + 18, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD17]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p5) :: (load (s8) from unknown-address + 19, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR13:%[0-9]+]]:_(s32) = G_OR [[SHL13]], [[ZEXTLOAD14]]
+    ; UNALIGNED_GFX11-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[OR13]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[OR12]]
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p5) :: (load (s8) from unknown-address + 20, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p5) :: (load (s8) from unknown-address + 21, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD15]]
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p5) :: (load (s8) from unknown-address + 22, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD21]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p5) :: (load (s8) from unknown-address + 23, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[ZEXTLOAD17]]
+    ; UNALIGNED_GFX11-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[OR16]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[OR15]]
+    ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR11]](s32), [[OR14]](s32), [[OR17]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>)
     ; UNALIGNED_GFX11-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96)
     ; UNALIGNED_GFX11-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96)
     ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96)
@@ -16978,12 +19924,99 @@ body: |
     ; UNALIGNED_GFX12: liveins: $vgpr0
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5)
-    ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
-    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
     ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD]](p5) :: (load (<3 x s32>) from unknown-address + 12, align 1, addrspace 5)
-    ; UNALIGNED_GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD1]](<3 x s32>)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
+    ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
+    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
+    ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C4]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p5) :: (load (s8) from unknown-address + 16, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p5) :: (load (s8) from unknown-address + 17, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[ZEXTLOAD12]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p5) :: (load (s8) from unknown-address + 18, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD17]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p5) :: (load (s8) from unknown-address + 19, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR13:%[0-9]+]]:_(s32) = G_OR [[SHL13]], [[ZEXTLOAD14]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[OR13]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[OR12]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p5) :: (load (s8) from unknown-address + 20, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p5) :: (load (s8) from unknown-address + 21, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD15]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p5) :: (load (s8) from unknown-address + 22, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD21]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p5) :: (load (s8) from unknown-address + 23, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[ZEXTLOAD17]]
+    ; UNALIGNED_GFX12-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[OR16]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[OR15]]
+    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR11]](s32), [[OR14]](s32), [[OR17]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>)
     ; UNALIGNED_GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96)
     ; UNALIGNED_GFX12-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96)
     ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96)
@@ -17234,12 +20267,50 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 2, addrspace 5)
-    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
     ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD]](p5) :: (load (<3 x s32>) from unknown-address + 12, align 2, addrspace 5)
-    ; GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD1]](<3 x s32>)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5)
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5)
+    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
+    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32)
+    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
+    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5)
+    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C2]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s16) from unknown-address + 16, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p5) :: (load (s16) from unknown-address + 18, addrspace 5)
+    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD4]]
+    ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C3]](s32)
+    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s16) from unknown-address + 20, addrspace 5)
+    ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
+    ; GFX12-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s16) from unknown-address + 22, addrspace 5)
+    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[ZEXTLOAD5]]
+    ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR3]](s32), [[OR4]](s32), [[OR5]](s32)
+    ; GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>)
     ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96)
     ; GFX12-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96)
     ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96)
@@ -17355,12 +20426,50 @@ body: |
     ; UNALIGNED_GFX11: liveins: $vgpr0
     ; UNALIGNED_GFX11-NEXT: {{  $}}
     ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 2, addrspace 5)
-    ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
-    ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
     ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD]](p5) :: (load (<3 x s32>) from unknown-address + 12, align 2, addrspace 5)
-    ; UNALIGNED_GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD1]](<3 x s32>)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
+    ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C2]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s16) from unknown-address + 16, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p5) :: (load (s16) from unknown-address + 18, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD4]]
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C3]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s16) from unknown-address + 20, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s16) from unknown-address + 22, addrspace 5)
+    ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[ZEXTLOAD5]]
+    ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR3]](s32), [[OR4]](s32), [[OR5]](s32)
+    ; UNALIGNED_GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>)
     ; UNALIGNED_GFX11-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96)
     ; UNALIGNED_GFX11-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96)
     ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96)
@@ -17370,12 +20479,50 @@ body: |
     ; UNALIGNED_GFX12: liveins: $vgpr0
     ; UNALIGNED_GFX12-NEXT: {{  $}}
     ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 2, addrspace 5)
-    ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
-    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
     ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD]](p5) :: (load (<3 x s32>) from unknown-address + 12, align 2, addrspace 5)
-    ; UNALIGNED_GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD1]](<3 x s32>)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
+    ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
+    ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
+    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
+    ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C2]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s16) from unknown-address + 16, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p5) :: (load (s16) from unknown-address + 18, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD4]]
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C3]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s16) from unknown-address + 20, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s16) from unknown-address + 22, addrspace 5)
+    ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[ZEXTLOAD5]]
+    ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR3]](s32), [[OR4]](s32), [[OR5]](s32)
+    ; UNALIGNED_GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>)
     ; UNALIGNED_GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96)
     ; UNALIGNED_GFX12-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96)
     ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96)
diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
index fd2d0bc901f3fc..3fc5d0d4b279eb 100644
--- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -475,8 +475,14 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    scratch_store_short off, v0, s0 offset:4
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; FLATSCR-NEXT:    scratch_load_dword v0, off, s0
-; FLATSCR-NEXT:    scratch_load_dword v1, off, s0 offset:2
+; FLATSCR-NEXT:    scratch_load_ushort v0, off, s0 offset:2
+; FLATSCR-NEXT:    scratch_load_ushort v3, off, s0
+; FLATSCR-NEXT:    s_waitcnt vmcnt(1)
+; FLATSCR-NEXT:    v_mov_b32_e32 v1, v0
+; FLATSCR-NEXT:    scratch_load_short_d16_hi v1, off, s0 offset:4
+; FLATSCR-NEXT:    s_mov_b32 s0, 0x5040100
+; FLATSCR-NEXT:    s_waitcnt vmcnt(1)
+; FLATSCR-NEXT:    v_perm_b32 v0, v0, v3, s0
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; FLATSCR-NEXT:    s_endpgm
@@ -537,8 +543,13 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i
 ; FLATSCR_GFX10-NEXT:    scratch_store_short off, v0, s0 offset:4
 ; FLATSCR_GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; FLATSCR_GFX10-NEXT:    s_clause 0x1
-; FLATSCR_GFX10-NEXT:    scratch_load_dword v0, off, s0
-; FLATSCR_GFX10-NEXT:    scratch_load_dword v1, off, s0 offset:2
+; FLATSCR_GFX10-NEXT:    scratch_load_ushort v0, off, s0 offset:2
+; FLATSCR_GFX10-NEXT:    scratch_load_ushort v3, off, s0
+; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(1)
+; FLATSCR_GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR_GFX10-NEXT:    v_perm_b32 v0, v0, v3, 0x5040100
+; FLATSCR_GFX10-NEXT:    scratch_load_short_d16_hi v1, off, s0 offset:4
 ; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR_GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; FLATSCR_GFX10-NEXT:    s_endpgm
@@ -561,8 +572,13 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i
 ; GFX11-NEXT:    scratch_store_b16 off, v0, off offset:4 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_load_b32 v0, off, off
-; GFX11-NEXT:    scratch_load_b32 v1, off, off offset:2
+; GFX11-NEXT:    scratch_load_u16 v0, off, off offset:2
+; GFX11-NEXT:    scratch_load_u16 v3, off, off
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x5040100
+; GFX11-NEXT:    scratch_load_d16_hi_b16 v1, off, off offset:4
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
 ; GFX11-NEXT:    s_nop 0
@@ -703,12 +719,12 @@ define <2 x i16> @chain_hi_to_lo_private_other_dep(ptr addrspace(5) %ptr) {
 ; FLATSCR-LABEL: chain_hi_to_lo_private_other_dep:
 ; FLATSCR:       ; %bb.0: ; %bb
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; FLATSCR-NEXT:    scratch_load_dword v0, v0, off
-; FLATSCR-NEXT:    s_mov_b32 s0, 0x7060302
+; FLATSCR-NEXT:    scratch_load_short_d16_hi v1, v0, off
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; FLATSCR-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
 ; FLATSCR-NEXT:    v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0]
-; FLATSCR-NEXT:    v_perm_b32 v0, v1, v0, s0
+; FLATSCR-NEXT:    scratch_load_short_d16 v1, v0, off offset:2
+; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR-NEXT:    v_mov_b32_e32 v0, v1
 ; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10_DEFAULT-LABEL: chain_hi_to_lo_private_other_dep:
@@ -725,22 +741,23 @@ define <2 x i16> @chain_hi_to_lo_private_other_dep(ptr addrspace(5) %ptr) {
 ; FLATSCR_GFX10-LABEL: chain_hi_to_lo_private_other_dep:
 ; FLATSCR_GFX10:       ; %bb.0: ; %bb
 ; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; FLATSCR_GFX10-NEXT:    scratch_load_dword v0, v0, off
+; FLATSCR_GFX10-NEXT:    scratch_load_short_d16_hi v1, v0, off
 ; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0)
-; FLATSCR_GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
 ; FLATSCR_GFX10-NEXT:    v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0]
-; FLATSCR_GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
+; FLATSCR_GFX10-NEXT:    scratch_load_short_d16 v1, v0, off offset:2
+; FLATSCR_GFX10-NEXT:    s_waitcnt vmcnt(0)
+; FLATSCR_GFX10-NEXT:    v_mov_b32_e32 v0, v1
 ; FLATSCR_GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: chain_hi_to_lo_private_other_dep:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    scratch_load_b32 v0, v0, off
+; GFX11-NEXT:    scratch_load_d16_hi_b16 v1, v0, off
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_pk_add_u16 v1, v1, 12 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
+; GFX11-NEXT:    scratch_load_d16_b16 v1, v0, off offset:2
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %gep_lo = getelementptr inbounds i16, ptr addrspace(5) %ptr, i64 1
diff --git a/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll b/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll
index 4e734d6e0884bc..fc33a274d7b11a 100644
--- a/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll
+++ b/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll
@@ -607,7 +607,14 @@ define amdgpu_kernel void @local_store_align1_v16i8(ptr addrspace(3) %out) #0 {
 ; MUBUF: buffer_load_ubyte
 ; MUBUF: buffer_load_ubyte
 ; MUBUF: buffer_load_ubyte
-; FLATSCR: scratch_load_dwordx2
+; FLATSCR: scratch_load_ubyte
+; FLATSCR: scratch_load_ubyte
+; FLATSCR: scratch_load_ubyte
+; FLATSCR: scratch_load_ubyte
+; FLATSCR: scratch_load_ubyte
+; FLATSCR: scratch_load_ubyte
+; FLATSCR: scratch_load_ubyte
+; FLATSCR: scratch_load_ubyte
 define double @private_load_align1_f64(ptr addrspace(5) %in) {
   %x = load double, ptr addrspace(5) %in, align 1
   ret double %x
@@ -622,7 +629,14 @@ define double @private_load_align1_f64(ptr addrspace(5) %in) {
 ; MUBUF: buffer_store_byte
 ; MUBUF: buffer_store_byte
 ; MUBUF: buffer_store_byte
-; FLATSCR: scratch_store_dwordx2
+; FLATSCR: scratch_store_byte
+; FLATSCR: scratch_store_byte
+; FLATSCR: scratch_store_byte
+; FLATSCR: scratch_store_byte
+; FLATSCR: scratch_store_byte
+; FLATSCR: scratch_store_byte
+; FLATSCR: scratch_store_byte
+; FLATSCR: scratch_store_byte
 define void @private_store_align1_f64(ptr addrspace(5) %out, double %x) #0 {
   store double %x, ptr addrspace(5) %out, align 1
   ret void
@@ -651,7 +665,10 @@ define void @private_store_align4_f64(ptr addrspace(5) %out, double %x) #0 {
 ; MUBUF: buffer_load_ushort
 ; MUBUF: buffer_load_ushort
 ; MUBUF: buffer_load_ushort
-; FLATSCR: scratch_load_dwordx2
+; FLATSCR: scratch_load_ushort
+; FLATSCR: scratch_load_ushort
+; FLATSCR: scratch_load_ushort
+; FLATSCR: scratch_load_ushort
 define double @private_load_align2_f64(ptr addrspace(5) %in) {
   %x = load double, ptr addrspace(5) %in, align 2
   ret double %x
@@ -662,7 +679,10 @@ define double @private_load_align2_f64(ptr addrspace(5) %in) {
 ; MUBUF: buffer_store_short
 ; MUBUF: buffer_store_short
 ; MUBUF: buffer_store_short
-; FLATSCR: scratch_store_dwordx2
+; FLATSCR: scratch_store_short
+; FLATSCR: scratch_store_short
+; FLATSCR: scratch_store_short
+; FLATSCR: scratch_store_short
 define void @private_store_align2_f64(ptr addrspace(5) %out, double %x) #0 {
   store double %x, ptr addrspace(5) %out, align 2
   ret void

>From aa8ba73592b1e204107512d48d96ec067531c709 Mon Sep 17 00:00:00 2001
From: Fabian Ritter <fabian.ritter at amd.com>
Date: Mon, 7 Oct 2024 12:16:11 -0400
Subject: [PATCH 3/3] fixup! fixup! [AMDGPU] Enable unaligned scratch accesses

Add FeatureUnalignedScratchAccess to GFX12
---
 llvm/lib/Target/AMDGPU/AMDGPU.td              |    6 +-
 .../CodeGen/AMDGPU/GlobalISel/flat-scratch.ll |  167 +--
 .../AMDGPU/GlobalISel/legalize-load-flat.mir  | 1072 ++------------
 .../GlobalISel/legalize-load-private.mir      | 1275 ++---------------
 4 files changed, 230 insertions(+), 2290 deletions(-)
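
For reference, here is a minimal LLVM IR sketch (not part of the patch; the function name is invented) of the access pattern that store_load_i64_unaligned in flat-scratch.ll exercises: a volatile i64 store and load through a private (addrspace(5)) pointer with align 1. With FeatureUnalignedScratchAccess set for GFX12, the updated checks below expect this to lower to a single scratch_store_b64/scratch_load_b64 pair instead of sixteen byte-sized scratch accesses.

define void @unaligned_scratch_i64_sketch(ptr addrspace(5) %p) {
  ; An align-1 store/load pair that the backend may now keep as wide
  ; 64-bit scratch accesses instead of splitting into byte operations.
  store volatile i64 15, ptr addrspace(5) %p, align 1
  %v = load volatile i64, ptr addrspace(5) %p, align 1
  ret void
}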

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 69569fbe95b267..62fac085897ab6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1246,9 +1246,9 @@ def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12",
    FeatureVOP3Literal, FeatureDPP8,
    FeatureNoDataDepHazard, FeaturePkFmacF16Inst,
    FeatureA16, FeatureFastDenormalF32, FeatureG16,
-   FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess,
-   FeatureTrue16BitInsts, FeatureDefaultComponentBroadcast,
-   FeatureMaxHardClauseLength32,
+   FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess,
+   FeatureUnalignedDSAccess, FeatureTrue16BitInsts,
+   FeatureDefaultComponentBroadcast, FeatureMaxHardClauseLength32,
    FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts,
    FeatureAgentScopeFineGrainedRemoteMemoryAtomics
   ]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index f2ade4df0f42d2..6e2e88f22600a8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -2416,39 +2416,12 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    scratch_store_b8 v0, v1, off scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:1 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:3 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:4 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:5 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:6 scope:SCOPE_SYS
+; GFX12-NEXT:    v_mov_b32_e32 v1, 15
+; GFX12-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:7 scope:SCOPE_SYS
+; GFX12-NEXT:    scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    scratch_load_u8 v1, v0, off scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:1 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:2 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:3 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:4 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:5 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:6 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    scratch_load_u8 v0, v0, off offset:7 scope:SCOPE_SYS
+; GFX12-NEXT:    scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2766,57 +2739,16 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 0
-; GFX12-NEXT:    v_mov_b32_e32 v3, 2
+; GFX12-NEXT:    s_mov_b32 s2, 3
+; GFX12-NEXT:    s_mov_b32 s1, 2
+; GFX12-NEXT:    s_mov_b32 s0, 1
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v2, s1
+; GFX12-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    scratch_store_b8 v0, v1, off scope:SCOPE_SYS
+; GFX12-NEXT:    scratch_store_b96 v0, v[1:3], off scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:1 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:3 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v1, 3
-; GFX12-NEXT:    scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:5 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:6 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:7 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    scratch_store_b8 v0, v1, off offset:8 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:9 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:10 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:11 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    scratch_load_u8 v1, v0, off scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:1 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:2 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:3 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:4 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:5 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:6 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:7 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:8 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:9 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:10 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    scratch_load_u8 v0, v0, off offset:11 scope:SCOPE_SYS
+; GFX12-NEXT:    scratch_load_b96 v[0:2], v0, off scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3251,74 +3183,17 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) {
 ; GFX12-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 0
-; GFX12-NEXT:    v_mov_b32_e32 v3, 2
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    scratch_store_b8 v0, v1, off scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:1 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:3 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v1, 3
-; GFX12-NEXT:    v_mov_b32_e32 v3, 4
-; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:5 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:6 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:7 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    scratch_store_b8 v0, v1, off offset:8 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:9 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:10 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:11 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    scratch_store_b8 v0, v3, off offset:12 scope:SCOPE_SYS
+; GFX12-NEXT:    s_mov_b32 s3, 4
+; GFX12-NEXT:    s_mov_b32 s2, 3
+; GFX12-NEXT:    s_mov_b32 s1, 2
+; GFX12-NEXT:    s_mov_b32 s0, 1
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2
+; GFX12-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:13 scope:SCOPE_SYS
+; GFX12-NEXT:    scratch_store_b128 v0, v[1:4], off scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:14 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    scratch_store_b8 v0, v2, off offset:15 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    scratch_load_u8 v1, v0, off scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:1 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:2 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:3 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:4 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:5 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:6 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:7 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:8 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:9 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:10 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:11 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:12 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:13 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    scratch_load_u8 v1, v0, off offset:14 scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    scratch_load_u8 v0, v0, off offset:15 scope:SCOPE_SYS
+; GFX12-NEXT:    scratch_load_b128 v[0:3], v0, off scope:SCOPE_SYS
 ; GFX12-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir
index 63e2e0ef434eec..8e299a19dfdbe5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir
@@ -497,14 +497,8 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: $vgpr0 = COPY [[OR]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s16_align1
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -666,14 +660,8 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: $vgpr0 = COPY [[OR]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2)
+    ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s32_align2
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -788,24 +776,8 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX12-NEXT: $vgpr0 = COPY [[OR2]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s32_align1
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -1205,26 +1177,8 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32)
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32)
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]]
-    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[OR2]](s64)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 2)
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s64_align2
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -1407,44 +1361,8 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
-    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
-    ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
-    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
-    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[OR6]](s64)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 1)
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s64_align1
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -1927,29 +1845,8 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8)
-    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10)
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32)
-    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 2)
+    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
     ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s96_align2
@@ -2179,55 +2076,8 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
-    ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
-    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
-    ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
-    ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
-    ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
-    ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
-    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32)
-    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 1)
+    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
     ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s96_align1
@@ -3050,70 +2900,8 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
-    ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
-    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
-    ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
-    ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
-    ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
-    ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
-    ; GFX12-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12
-    ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
-    ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
-    ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
-    ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
-    ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
-    ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
-    ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
-    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
-    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 1)
+    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>)
     ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s128_align1
@@ -3722,45 +3510,8 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
-    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
-    ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
-    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
-    ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR6]](s64)
-    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p0) :: (load (p1), align 1)
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_p1_align1
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -4178,27 +3929,8 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32)
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32)
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]]
-    ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR2]](s64)
-    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 2)
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_p4_align2
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -4384,45 +4116,8 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
-    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
-    ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
-    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
-    ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR6]](s64)
-    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 1)
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_p4_align1
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -4678,15 +4373,8 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32)
-    ; GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5), align 2)
+    ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p5)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_p5_align2
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -4806,25 +4494,8 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32)
-    ; GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5), align 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p5)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_p5_align1
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -5088,14 +4759,8 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: $vgpr0 = COPY [[OR]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2s8_align1
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -5558,41 +5223,36 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16), align 1)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
     ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 2)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
     ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]]
-    ; GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32)
-    ; GFX12-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32)
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C2]](s32)
+    ; GFX12-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32)
     ; GFX12-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-    ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
-    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
-    ; GFX12-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]]
+    ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
+    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
+    ; GFX12-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]]
     ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
-    ; GFX12-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]]
-    ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]]
+    ; GFX12-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]]
+    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL1]]
     ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
-    ; GFX12-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]]
+    ; GFX12-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]]
     ; GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32)
-    ; GFX12-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]]
-    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16)
-    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL3]]
-    ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16)
-    ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16)
-    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]]
-    ; GFX12-NEXT: $vgpr0 = COPY [[OR4]](s32)
+    ; GFX12-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]]
+    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16)
+    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]]
+    ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16)
+    ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16)
+    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]]
+    ; GFX12-NEXT: $vgpr0 = COPY [[OR3]](s32)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v3s8_align1
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -5837,14 +5497,8 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: $vgpr0 = COPY [[OR]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2)
+    ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v4s8_align2
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -5960,24 +5614,8 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX12-NEXT: $vgpr0 = COPY [[OR2]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v4s8_align1
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -6475,14 +6113,8 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16))
-    ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
-    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-    ; GFX12-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>), align 2)
+    ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2s16_align2
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -6605,24 +6237,8 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
-    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-    ; GFX12-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>), align 1)
+    ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2s16_align1
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -7581,36 +7197,22 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1)
+    ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
     ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2, align 1)
+    ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64)
+    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4, align 1)
+    ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
     ; GFX12-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
     ; GFX12-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
     ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
     ; GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
-    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32)
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32)
     ; GFX12-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
     ; GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
     ; GFX12-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
@@ -7983,24 +7585,8 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16))
-    ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64)
-    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
-    ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
-    ; GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32)
-    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-    ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
-    ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
-    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 2)
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v4s16_align2
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -8191,42 +7777,8 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32)
-    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 6
-    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32)
-    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-    ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
-    ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
-    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 1)
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v4s16_align1
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -9804,45 +9356,8 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32)
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32)
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]]
-    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8)
-    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10)
-    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR3]](s32)
-    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s16) from unknown-address + 12)
-    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s16) from unknown-address + 14)
-    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD3]]
-    ; GFX12-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR4]](s32)
-    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C3]](s32)
-    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32)
-    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s64) = G_OR [[SHL5]], [[ZEXT1]]
-    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR2]](s64), [[OR5]](s64)
-    ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 2)
+    ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2s64_align2
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -10144,79 +9659,8 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
-    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
-    ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
-    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
-    ; GFX12-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
-    ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
-    ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]]
-    ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
-    ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
-    ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]]
-    ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]]
-    ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32)
-    ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
-    ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
-    ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]]
-    ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
-    ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
-    ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]]
-    ; GFX12-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]]
-    ; GFX12-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32)
-    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
-    ; GFX12-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32)
-    ; GFX12-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]]
-    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64)
-    ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1)
+    ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2s64_align1
     ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1
@@ -10994,114 +10438,14 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
     ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
-    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
-    ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
-    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
-    ; GFX12-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
-    ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
-    ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]]
-    ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
-    ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
-    ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]]
-    ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]]
-    ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32)
-    ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
-    ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
-    ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]]
-    ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
-    ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
-    ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]]
-    ; GFX12-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]]
-    ; GFX12-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32)
-    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
-    ; GFX12-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32)
-    ; GFX12-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]]
-    ; GFX12-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
-    ; GFX12-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16)
-    ; GFX12-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17)
-    ; GFX12-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]]
-    ; GFX12-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18)
-    ; GFX12-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19)
-    ; GFX12-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]]
-    ; GFX12-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[OR14]]
-    ; GFX12-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32)
-    ; GFX12-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20)
-    ; GFX12-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21)
-    ; GFX12-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[ZEXTLOAD15]]
-    ; GFX12-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22)
-    ; GFX12-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23)
-    ; GFX12-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]]
-    ; GFX12-NEXT: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]]
-    ; GFX12-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32)
-    ; GFX12-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
-    ; GFX12-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32)
-    ; GFX12-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]]
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p0) :: (load (s64) from unknown-address + 16, align 1)
+    ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[LOAD]](<2 x s64>)
     ; GFX12-NEXT: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF
-    ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64), [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>)
-    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64), [[OR20]](s64), [[UV3]](s64)
+    ; GFX12-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64), [[UV4:%[0-9]+]]:_(s64), [[UV5:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>)
+    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[UV]](s64), [[UV1]](s64), [[LOAD1]](s64), [[UV5]](s64)
     ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v3s64_align1
@@ -12016,147 +11360,11 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
     ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
-    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
-    ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
-    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
-    ; GFX12-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
-    ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
-    ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]]
-    ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
-    ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
-    ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]]
-    ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]]
-    ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32)
-    ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
-    ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
-    ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]]
-    ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
-    ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
-    ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]]
-    ; GFX12-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]]
-    ; GFX12-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32)
-    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
-    ; GFX12-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32)
-    ; GFX12-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]]
-    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64)
-    ; GFX12-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
-    ; GFX12-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16)
-    ; GFX12-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17)
-    ; GFX12-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]]
-    ; GFX12-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18)
-    ; GFX12-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19)
-    ; GFX12-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]]
-    ; GFX12-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[OR14]]
-    ; GFX12-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32)
-    ; GFX12-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20)
-    ; GFX12-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21)
-    ; GFX12-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[ZEXTLOAD15]]
-    ; GFX12-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22)
-    ; GFX12-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23)
-    ; GFX12-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]]
-    ; GFX12-NEXT: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]]
-    ; GFX12-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32)
-    ; GFX12-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
-    ; GFX12-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32)
-    ; GFX12-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]]
-    ; GFX12-NEXT: [[PTR_ADD23:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C6]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD18:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD23]](p0) :: (load (s8) from unknown-address + 24)
-    ; GFX12-NEXT: [[PTR_ADD24:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD19:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD24]](p0) :: (load (s8) from unknown-address + 25)
-    ; GFX12-NEXT: [[SHL21:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD19]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR21:%[0-9]+]]:_(s32) = G_OR [[SHL21]], [[ZEXTLOAD18]]
-    ; GFX12-NEXT: [[PTR_ADD25:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD20:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD25]](p0) :: (load (s8) from unknown-address + 26)
-    ; GFX12-NEXT: [[PTR_ADD26:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD25]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD26]](p0) :: (load (s8) from unknown-address + 27)
-    ; GFX12-NEXT: [[SHL22:%[0-9]+]]:_(s32) = G_SHL [[LOAD6]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR22:%[0-9]+]]:_(s32) = G_OR [[SHL22]], [[ZEXTLOAD20]]
-    ; GFX12-NEXT: [[SHL23:%[0-9]+]]:_(s32) = G_SHL [[OR22]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR23:%[0-9]+]]:_(s32) = G_OR [[SHL23]], [[OR21]]
-    ; GFX12-NEXT: [[ZEXT3:%[0-9]+]]:_(s64) = G_ZEXT [[OR23]](s32)
-    ; GFX12-NEXT: [[PTR_ADD27:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C4]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD21:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD27]](p0) :: (load (s8) from unknown-address + 28)
-    ; GFX12-NEXT: [[PTR_ADD28:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD22:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD28]](p0) :: (load (s8) from unknown-address + 29)
-    ; GFX12-NEXT: [[SHL24:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD22]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR24:%[0-9]+]]:_(s32) = G_OR [[SHL24]], [[ZEXTLOAD21]]
-    ; GFX12-NEXT: [[PTR_ADD29:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD23:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD29]](p0) :: (load (s8) from unknown-address + 30)
-    ; GFX12-NEXT: [[PTR_ADD30:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD29]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD30]](p0) :: (load (s8) from unknown-address + 31)
-    ; GFX12-NEXT: [[SHL25:%[0-9]+]]:_(s32) = G_SHL [[LOAD7]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR25:%[0-9]+]]:_(s32) = G_OR [[SHL25]], [[ZEXTLOAD23]]
-    ; GFX12-NEXT: [[SHL26:%[0-9]+]]:_(s32) = G_SHL [[OR25]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR26:%[0-9]+]]:_(s32) = G_OR [[SHL26]], [[OR24]]
-    ; GFX12-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[OR26]](s32)
-    ; GFX12-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
-    ; GFX12-NEXT: [[SHL27:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT3]], [[COPY3]](s32)
-    ; GFX12-NEXT: [[OR27:%[0-9]+]]:_(s64) = G_OR [[SHL27]], [[ZEXT3]]
-    ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR20]](s64), [[OR27]](s64)
-    ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s64>), [[BUILD_VECTOR1]](<2 x s64>)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[PTR_ADD]](p0) :: (load (<2 x s64>) from unknown-address + 16, align 1)
+    ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>)
     ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<4 x s64>)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v4s64_align1
@@ -13194,70 +12402,8 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
-    ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8)
-    ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9)
-    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
-    ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10)
-    ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11)
-    ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
-    ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
-    ; GFX12-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12
-    ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12)
-    ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13)
-    ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
-    ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14)
-    ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15)
-    ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
-    ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
-    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
-    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 1)
+    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>)
     ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2p1_align1
@@ -13744,40 +12890,8 @@ body: |
     ; GFX12: liveins: $vgpr0_vgpr1
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8))
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4)
-    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5)
-    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64)
-    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6)
-    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7)
-    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32)
-    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load (<2 x s32>), align 1)
+    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p3>) = G_BITCAST [[LOAD]](<2 x s32>)
     ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BITCAST]](<2 x p3>)
     ;
     ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2p3_align1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir
index e8df205e697ef5..8835956e11a4e2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir
@@ -657,14 +657,8 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: $vgpr0 = COPY [[OR]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5)
+    ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_s16_align1
     ; UNALIGNED_GFX9: liveins: $vgpr0
@@ -880,14 +874,8 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: $vgpr0 = COPY [[OR]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5)
+    ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_s32_align2
     ; UNALIGNED_GFX9: liveins: $vgpr0
@@ -1045,24 +1033,8 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX12-NEXT: $vgpr0 = COPY [[OR2]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5)
+    ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_s32_align1
     ; UNALIGNED_GFX9: liveins: $vgpr0
@@ -1596,20 +1568,14 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
     ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
     ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]]
-    ; GFX12-NEXT: $vgpr0 = COPY [[OR1]](s32)
+    ; GFX12-NEXT: $vgpr0 = COPY [[OR]](s32)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_s24_align1
     ; UNALIGNED_GFX9: liveins: $vgpr0
@@ -2210,26 +2176,8 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32)
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32)
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]]
-    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[OR2]](s64)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p5) :: (load (s64), align 2, addrspace 5)
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_s64_align2
     ; UNALIGNED_GFX9: liveins: $vgpr0
@@ -2483,44 +2431,8 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
-    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
-    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
-    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
-    ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
-    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
-    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[OR6]](s64)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p5) :: (load (s64), align 1, addrspace 5)
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_s64_align1
     ; UNALIGNED_GFX9: liveins: $vgpr0
@@ -2900,54 +2812,8 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
-    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
-    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
-    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
-    ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
-    ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
-    ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
-    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32)
-    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5)
+    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
     ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_s96_align16
@@ -3599,29 +3465,8 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5)
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32)
-    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 2, addrspace 5)
+    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
     ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_s96_align2
@@ -3954,54 +3799,8 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
-    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
-    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
-    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
-    ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
-    ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
-    ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
-    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32)
-    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5)
+    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
     ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_s96_align1
@@ -4485,69 +4284,8 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
-    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
-    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
-    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
-    ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
-    ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
-    ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
-    ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
-    ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5)
-    ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
-    ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5)
-    ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
-    ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
-    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
-    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5)
+    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>)
     ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_s128_align16
@@ -5328,36 +5066,8 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5)
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
-    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5)
-    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32)
-    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 2, addrspace 5)
+    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>)
     ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_s128_align2
@@ -5769,69 +5479,8 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
-    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
-    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
-    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
-    ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
-    ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
-    ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
-    ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
-    ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5)
-    ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
-    ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5)
-    ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
-    ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
-    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
-    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5)
+    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>)
     ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_s128_align1
@@ -6446,27 +6095,8 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32)
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32)
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]]
-    ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR2]](s64)
-    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p5) :: (load (p1), align 2, addrspace 5)
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_p1_align2
     ; UNALIGNED_GFX9: liveins: $vgpr0
@@ -6722,45 +6352,8 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
-    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
-    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
-    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
-    ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
-    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
-    ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR6]](s64)
-    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p5) :: (load (p1), align 1, addrspace 5)
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_p1_align1
     ; UNALIGNED_GFX9: liveins: $vgpr0
@@ -7092,15 +6685,8 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR]](s32)
-    ; GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 2, addrspace 5)
+    ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p3)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_p3_align2
     ; UNALIGNED_GFX9: liveins: $vgpr0
@@ -7265,25 +6851,8 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32)
-    ; GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 1, addrspace 5)
+    ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p3)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_p3_align1
     ; UNALIGNED_GFX9: liveins: $vgpr0
@@ -7545,15 +7114,8 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32)
-    ; GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 2, addrspace 5)
+    ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p5)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_p5_align2
     ; UNALIGNED_GFX9: liveins: $vgpr0
@@ -7718,25 +7280,8 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32)
-    ; GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 1, addrspace 5)
+    ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p5)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_p5_align1
     ; UNALIGNED_GFX9: liveins: $vgpr0
@@ -8012,15 +7557,10 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32)
-    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[LSHR]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C]](s32)
+    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LSHR]](s32)
     ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_v2s8_align1
@@ -8673,41 +8213,36 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
     ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
     ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]]
-    ; GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32)
-    ; GFX12-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32)
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C2]](s32)
+    ; GFX12-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32)
     ; GFX12-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-    ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
-    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
-    ; GFX12-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]]
+    ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
+    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
+    ; GFX12-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]]
     ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
-    ; GFX12-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]]
-    ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]]
+    ; GFX12-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]]
+    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
+    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16)
+    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL1]]
     ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
-    ; GFX12-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]]
+    ; GFX12-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]]
     ; GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32)
-    ; GFX12-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]]
-    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16)
-    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL3]]
-    ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16)
-    ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16)
-    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]]
-    ; GFX12-NEXT: $vgpr0 = COPY [[OR4]](s32)
+    ; GFX12-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]]
+    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16)
+    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]]
+    ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16)
+    ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16)
+    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32)
+    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]]
+    ; GFX12-NEXT: $vgpr0 = COPY [[OR3]](s32)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_v3s8_align1
     ; UNALIGNED_GFX9: liveins: $vgpr0
@@ -9334,69 +8869,8 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
-    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
-    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
-    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
-    ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
-    ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
-    ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
-    ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
-    ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5)
-    ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
-    ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5)
-    ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
-    ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
-    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
-    ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5)
+    ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_v16s8_align16
     ; UNALIGNED_GFX9: liveins: $vgpr0
@@ -9844,14 +9318,8 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5)
-    ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
-    ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
-    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-    ; GFX12-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 2, addrspace 5)
+    ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_v2s16_align2
     ; UNALIGNED_GFX9: liveins: $vgpr0
@@ -10021,24 +9489,8 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
-    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-    ; GFX12-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 1, addrspace 5)
+    ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_v2s16_align1
     ; UNALIGNED_GFX9: liveins: $vgpr0
@@ -11036,36 +10488,22 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5)
+    ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
     ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, align 1, addrspace 5)
+    ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
+    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
+    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, align 1, addrspace 5)
+    ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
     ; GFX12-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
     ; GFX12-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
     ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
     ; GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
-    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32)
+    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32)
     ; GFX12-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
     ; GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
     ; GFX12-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
@@ -11604,24 +11042,8 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5)
-    ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
-    ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
-    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
-    ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5)
-    ; GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32)
-    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-    ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
-    ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
-    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p5) :: (load (<4 x s16>), align 2, addrspace 5)
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_v4s16_align2
     ; UNALIGNED_GFX9: liveins: $vgpr0
@@ -11894,42 +11316,8 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32)
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32)
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32)
-    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 6
-    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
-    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32)
-    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16)
-    ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16)
-    ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
-    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p5) :: (load (<4 x s16>), align 1, addrspace 5)
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_v4s16_align1
     ; UNALIGNED_GFX9: liveins: $vgpr0
@@ -12429,22 +11817,8 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32)
-    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p5) :: (load (<2 x s32>), align 2, addrspace 5)
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_v2s32_align2
     ; UNALIGNED_GFX9: liveins: $vgpr0
@@ -12690,40 +12064,8 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
-    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
-    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32)
-    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p5) :: (load (<2 x s32>), align 1, addrspace 5)
+    ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_v2s32_align1
     ; UNALIGNED_GFX9: liveins: $vgpr0
@@ -13089,54 +12431,8 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
-    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
-    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
-    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
-    ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
-    ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
-    ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
-    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32)
-    ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5)
+    ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](<3 x s32>)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_v3s32_align16
     ; UNALIGNED_GFX9: liveins: $vgpr0
@@ -13745,69 +13041,8 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
-    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
-    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
-    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
-    ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
-    ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
-    ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
-    ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
-    ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5)
-    ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
-    ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5)
-    ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
-    ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
-    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
-    ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5)
+    ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_v4s32_align16
     ; UNALIGNED_GFX9: liveins: $vgpr0
@@ -14555,36 +13790,8 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5)
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
-    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5)
-    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32)
-    ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 2, addrspace 5)
+    ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_v4s32_align2
     ; UNALIGNED_GFX9: liveins: $vgpr0
@@ -14985,69 +14192,8 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
-    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
-    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
-    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
-    ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
-    ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
-    ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
-    ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
-    ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5)
-    ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
-    ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5)
-    ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
-    ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
-    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
-    ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5)
+    ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_v4s32_align1
     ; UNALIGNED_GFX9: liveins: $vgpr0
@@ -16455,78 +15601,8 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-    ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32)
-    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
-    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
-    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32)
-    ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32
-    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32)
-    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]]
-    ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
-    ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]]
-    ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
-    ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]]
-    ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]]
-    ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32)
-    ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5)
-    ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]]
-    ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5)
-    ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]]
-    ; GFX12-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]]
-    ; GFX12-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32)
-    ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32)
-    ; GFX12-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32)
-    ; GFX12-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]]
-    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64)
-    ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p5) :: (load (<2 x s64>), align 1, addrspace 5)
+    ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>)
     ;
     ; UNALIGNED_GFX9-LABEL: name: test_load_private_v2s64_align16
     ; UNALIGNED_GFX9: liveins: $vgpr0
@@ -19516,99 +18592,12 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5)
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5)
+    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
     ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]]
-    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5)
-    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5)
-    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]]
-    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]]
-    ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5)
-    ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]]
-    ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5)
-    ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]]
-    ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]]
-    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32)
-    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
-    ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
-    ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5)
-    ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]]
-    ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5)
-    ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]]
-    ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]]
-    ; GFX12-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C4]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p5) :: (load (s8) from unknown-address + 16, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p5) :: (load (s8) from unknown-address + 17, addrspace 5)
-    ; GFX12-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[ZEXTLOAD12]]
-    ; GFX12-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p5) :: (load (s8) from unknown-address + 18, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD17]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p5) :: (load (s8) from unknown-address + 19, addrspace 5)
-    ; GFX12-NEXT: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR13:%[0-9]+]]:_(s32) = G_OR [[SHL13]], [[ZEXTLOAD14]]
-    ; GFX12-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[OR13]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[OR12]]
-    ; GFX12-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C1]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p5) :: (load (s8) from unknown-address + 20, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], [[C]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p5) :: (load (s8) from unknown-address + 21, addrspace 5)
-    ; GFX12-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD15]]
-    ; GFX12-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p5) :: (load (s8) from unknown-address + 22, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD21]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p5) :: (load (s8) from unknown-address + 23, addrspace 5)
-    ; GFX12-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[ZEXTLOAD17]]
-    ; GFX12-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[OR16]], [[C3]](s32)
-    ; GFX12-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[OR15]]
-    ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR11]](s32), [[OR14]](s32), [[OR17]](s32)
-    ; GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD]](p5) :: (load (<3 x s32>) from unknown-address + 12, align 1, addrspace 5)
+    ; GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD1]](<3 x s32>)
     ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96)
     ; GFX12-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96)
     ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96)
@@ -20267,50 +19256,12 @@ body: |
     ; GFX12: liveins: $vgpr0
     ; GFX12-NEXT: {{  $}}
     ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0
-    ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5)
-    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 2, addrspace 5)
+    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>)
+    ; GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
     ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5)
-    ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]]
-    ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
-    ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5)
-    ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]]
-    ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5)
-    ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]]
-    ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32)
-    ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>)
-    ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
-    ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5)
-    ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]]
-    ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C2]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s16) from unknown-address + 16, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p5) :: (load (s16) from unknown-address + 18, addrspace 5)
-    ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD4]]
-    ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C3]](s32)
-    ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s16) from unknown-address + 20, addrspace 5)
-    ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32)
-    ; GFX12-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s16) from unknown-address + 22, addrspace 5)
-    ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32)
-    ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[ZEXTLOAD5]]
-    ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR3]](s32), [[OR4]](s32), [[OR5]](s32)
-    ; GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>)
+    ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD]](p5) :: (load (<3 x s32>) from unknown-address + 12, align 2, addrspace 5)
+    ; GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD1]](<3 x s32>)
     ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96)
     ; GFX12-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96)
     ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96)


