[llvm] [AMDGPU] Enable more consecutive load folding during aggressive-instcombine (PR #158036)

via llvm-commits llvm-commits at lists.llvm.org
Fri Sep 12 05:17:54 PDT 2025


https://github.com/macurtis-amd updated https://github.com/llvm/llvm-project/pull/158036

From e7f25f7b34f8312cc2ebcd443994beddca9fcb11 Mon Sep 17 00:00:00 2001
From: Matthew Curtis <macurtis at amd.com>
Date: Fri, 27 Jun 2025 13:05:11 -0500
Subject: [PATCH 1/5] AMDGPU: Report unaligned scratch access as fast if
 supported by tgt

This enables more consecutive load folding during aggressive-instcombine.
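
For reference, a minimal sketch of the kind of fold this unlocks (hypothetical
function, mirroring the pattern in the new fold-consecutive-loads.ll test):
adjacent byte loads recombined with zext/shl/or can be merged into one wide
load, but only if the target reports the misaligned wider access as fast.

define i16 @fold_two_bytes(ptr %p) {
  ; two adjacent i8 loads stitched together with zext/shl/or
  %b0 = load i8, ptr %p, align 1
  %gep = getelementptr inbounds i8, ptr %p, i64 1
  %b1 = load i8, ptr %gep, align 1
  %z0 = zext i8 %b0 to i16
  %z1 = zext i8 %b1 to i16
  %hi = shl i16 %z1, 8
  %r = or i16 %hi, %z0
  ret i16 %r
  ; with the misaligned i16 access reported as fast, aggressive-instcombine
  ; can rewrite the body to:
  ;   %r = load i16, ptr %p, align 1
}

Previously, private/flat accesses below align-4 on subtargets with unaligned
scratch access enabled were allowed but reported as not fast, so this fold was
skipped.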
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |   8 +-
 .../CodeGen/AMDGPU/fold-consecutive-loads.ll  | 457 ++++++++++++++
 .../test/CodeGen/AMDGPU/memcpy-fixed-align.ll |  16 +-
 llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll    | 152 +++--
 .../AMDGPU/memcpy-param-combinations.ll       | 558 ++++++++----------
 .../AMDGPU/memmove-param-combinations.ll      |  64 +-
 6 files changed, 815 insertions(+), 440 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/fold-consecutive-loads.ll

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index edce4856f77b0..f8e5740880708 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2090,10 +2090,16 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
   if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
       AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
     bool AlignedBy4 = Alignment >= Align(4);
+    if (Subtarget->hasUnalignedScratchAccessEnabled()) {
+      if (IsFast)
+        *IsFast = AlignedBy4 ? Size : 1;
+      return true;
+    }
+
     if (IsFast)
       *IsFast = AlignedBy4;
 
-    return AlignedBy4 || Subtarget->hasUnalignedScratchAccessEnabled();
+    return AlignedBy4;
   }
 
   // So long as they are correct, wide global memory operations perform better
diff --git a/llvm/test/CodeGen/AMDGPU/fold-consecutive-loads.ll b/llvm/test/CodeGen/AMDGPU/fold-consecutive-loads.ll
new file mode 100644
index 0000000000000..610760f788ea8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fold-consecutive-loads.ll
@@ -0,0 +1,457 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes=sroa,instcombine,aggressive-instcombine %s -S -o - | FileCheck %s
+
+define i64 @quux(ptr %arg) {
+; CHECK-LABEL: define i64 @quux(
+; CHECK-SAME: ptr [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load i64, ptr [[ARG]], align 1
+; CHECK-NEXT:    ret i64 [[LOAD]]
+;
+bb:
+  %load = load i8, ptr %arg, align 1
+  %getelementptr = getelementptr inbounds nuw i8, ptr %arg, i64 1
+  %load1 = load i8, ptr %getelementptr, align 1
+  %getelementptr2 = getelementptr inbounds nuw i8, ptr %arg, i64 2
+  %load3 = load i8, ptr %getelementptr2, align 1
+  %getelementptr4 = getelementptr inbounds nuw i8, ptr %arg, i64 3
+  %load5 = load i8, ptr %getelementptr4, align 1
+  %getelementptr6 = getelementptr inbounds nuw i8, ptr %arg, i64 4
+  %load7 = load i8, ptr %getelementptr6, align 1
+  %getelementptr8 = getelementptr inbounds nuw i8, ptr %arg, i64 5
+  %load9 = load i8, ptr %getelementptr8, align 1
+  %getelementptr10 = getelementptr inbounds nuw i8, ptr %arg, i64 6
+  %load11 = load i8, ptr %getelementptr10, align 1
+  %getelementptr12 = getelementptr inbounds nuw i8, ptr %arg, i64 7
+  %load13 = load i8, ptr %getelementptr12, align 1
+  %zext = zext i8 %load13 to i64
+  %shl = shl nuw i64 %zext, 56
+  %zext14 = zext i8 %load11 to i64
+  %shl15 = shl nuw nsw i64 %zext14, 48
+  %or = or disjoint i64 %shl, %shl15
+  %zext16 = zext i8 %load9 to i64
+  %shl17 = shl nuw nsw i64 %zext16, 40
+  %or18 = or disjoint i64 %or, %shl17
+  %zext19 = zext i8 %load7 to i64
+  %shl20 = shl nuw nsw i64 %zext19, 32
+  %or21 = or disjoint i64 %or18, %shl20
+  %zext22 = zext i8 %load5 to i64
+  %shl23 = shl nuw nsw i64 %zext22, 24
+  %or24 = or disjoint i64 %or21, %shl23
+  %zext25 = zext i8 %load3 to i64
+  %shl26 = shl nuw nsw i64 %zext25, 16
+  %zext27 = zext i8 %load1 to i64
+  %shl28 = shl nuw nsw i64 %zext27, 8
+  %or29 = or disjoint i64 %or24, %shl26
+  %zext30 = zext i8 %load to i64
+  %or31 = or i64 %or29, %shl28
+  %or32 = or i64 %or31, %zext30
+  ret i64 %or32
+}
+
+
+; The following test case was reduced from a client kernel.
+%struct.eggs = type { i8 }
+%struct.pluto = type { %struct.spam }
+%struct.spam = type { <32 x i8> }
+%struct.snork = type { i8 }
+%struct.quux = type { ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr }
+%struct.bar = type { ptr, ptr, ptr, ptr, ptr, ptr }
+
+define fastcc void @hoge(ptr noundef nonnull readonly align 8 captures(none) dereferenceable(48) %arg) {
+; CHECK-LABEL: define fastcc void @hoge(
+; CHECK-SAME: ptr noundef nonnull readonly align 8 captures(none) dereferenceable(48) [[ARG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    [[LOAD:%.*]] = load ptr, ptr [[ARG]], align 8
+; CHECK-NEXT:    [[GETELEMENTPTR13:%.*]] = getelementptr inbounds nuw i8, ptr [[ARG]], i64 16
+; CHECK-NEXT:    [[LOAD14:%.*]] = load ptr, ptr [[GETELEMENTPTR13]], align 8
+; CHECK-NEXT:    [[LOAD28:%.*]] = load i64, ptr [[LOAD]], align 1
+; CHECK-NEXT:    [[LOAD29:%.*]] = load i64, ptr [[LOAD14]], align 1
+; CHECK-NEXT:    [[GETELEMENTPTR72:%.*]] = getelementptr inbounds nuw i8, ptr [[LOAD]], i64 8
+; CHECK-NEXT:    [[LOAD73:%.*]] = load i64, ptr [[GETELEMENTPTR72]], align 1
+; CHECK-NEXT:    [[GETELEMENTPTR75:%.*]] = getelementptr inbounds nuw i8, ptr [[LOAD14]], i64 8
+; CHECK-NEXT:    [[LOAD76:%.*]] = load i64, ptr [[GETELEMENTPTR75]], align 1
+; CHECK-NEXT:    [[GETELEMENTPTR120:%.*]] = getelementptr inbounds nuw i8, ptr [[LOAD]], i64 16
+; CHECK-NEXT:    [[LOAD121:%.*]] = load i64, ptr [[GETELEMENTPTR120]], align 1
+; CHECK-NEXT:    [[GETELEMENTPTR123:%.*]] = getelementptr inbounds nuw i8, ptr [[LOAD14]], i64 16
+; CHECK-NEXT:    [[LOAD124:%.*]] = load i64, ptr [[GETELEMENTPTR123]], align 1
+; CHECK-NEXT:    [[GETELEMENTPTR168:%.*]] = getelementptr inbounds nuw i8, ptr [[LOAD]], i64 24
+; CHECK-NEXT:    [[LOAD169:%.*]] = load i64, ptr [[GETELEMENTPTR168]], align 1
+; CHECK-NEXT:    [[GETELEMENTPTR171:%.*]] = getelementptr inbounds nuw i8, ptr [[LOAD14]], i64 24
+; CHECK-NEXT:    [[LOAD172:%.*]] = load i32, ptr [[GETELEMENTPTR171]], align 1
+; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[LOAD172]] to i64
+; CHECK-NEXT:    [[GETELEMENTPTR195:%.*]] = getelementptr inbounds nuw i8, ptr [[LOAD14]], i64 28
+; CHECK-NEXT:    [[LOAD196:%.*]] = load i8, ptr [[GETELEMENTPTR195]], align 1
+; CHECK-NEXT:    [[ALLOCA2_SROA_30_28_INSERT_EXT:%.*]] = zext i8 [[LOAD196]] to i64
+; CHECK-NEXT:    [[ALLOCA2_SROA_30_28_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[ALLOCA2_SROA_30_28_INSERT_EXT]], 32
+; CHECK-NEXT:    [[GETELEMENTPTR201:%.*]] = getelementptr inbounds nuw i8, ptr [[LOAD14]], i64 29
+; CHECK-NEXT:    [[LOAD202:%.*]] = load i8, ptr [[GETELEMENTPTR201]], align 1
+; CHECK-NEXT:    [[ALLOCA2_SROA_30_29_INSERT_EXT:%.*]] = zext i8 [[LOAD202]] to i64
+; CHECK-NEXT:    [[ALLOCA2_SROA_30_29_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[ALLOCA2_SROA_30_29_INSERT_EXT]], 40
+; CHECK-NEXT:    [[ALLOCA2_SROA_30_29_INSERT_MASK:%.*]] = or disjoint i64 [[TMP0]], [[ALLOCA2_SROA_30_28_INSERT_SHIFT]]
+; CHECK-NEXT:    [[GETELEMENTPTR207:%.*]] = getelementptr inbounds nuw i8, ptr [[LOAD14]], i64 30
+; CHECK-NEXT:    [[LOAD208:%.*]] = load i8, ptr [[GETELEMENTPTR207]], align 1
+; CHECK-NEXT:    [[ALLOCA2_SROA_30_30_INSERT_EXT:%.*]] = zext i8 [[LOAD208]] to i64
+; CHECK-NEXT:    [[ALLOCA2_SROA_30_30_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[ALLOCA2_SROA_30_30_INSERT_EXT]], 48
+; CHECK-NEXT:    [[GETELEMENTPTR213:%.*]] = getelementptr inbounds nuw i8, ptr [[LOAD14]], i64 31
+; CHECK-NEXT:    [[LOAD214:%.*]] = load i8, ptr [[GETELEMENTPTR213]], align 1
+; CHECK-NEXT:    [[ALLOCA2_SROA_30_31_INSERT_EXT:%.*]] = zext i8 [[LOAD214]] to i64
+; CHECK-NEXT:    [[ALLOCA2_SROA_30_31_INSERT_SHIFT:%.*]] = shl nuw i64 [[ALLOCA2_SROA_30_31_INSERT_EXT]], 56
+; CHECK-NEXT:    [[ALLOCA2_SROA_30_30_INSERT_MASK_MASKED:%.*]] = or i64 [[ALLOCA2_SROA_30_29_INSERT_MASK]], [[ALLOCA2_SROA_30_29_INSERT_SHIFT]]
+; CHECK-NEXT:    [[ALLOCA2_SROA_30_31_INSERT_MASK:%.*]] = or i64 [[ALLOCA2_SROA_30_30_INSERT_MASK_MASKED]], [[ALLOCA2_SROA_30_30_INSERT_SHIFT]]
+; CHECK-NEXT:    [[ALLOCA2_SROA_30_31_INSERT_INSERT:%.*]] = or i64 [[ALLOCA2_SROA_30_31_INSERT_MASK]], [[ALLOCA2_SROA_30_31_INSERT_SHIFT]]
+; CHECK-NEXT:    [[GETELEMENTPTR216:%.*]] = getelementptr inbounds nuw i8, ptr [[ARG]], i64 40
+; CHECK-NEXT:    [[LOAD217:%.*]] = load ptr, ptr [[GETELEMENTPTR216]], align 8
+; CHECK-NEXT:    [[LOAD220:%.*]] = load <16 x float>, ptr [[LOAD217]], align 64
+; CHECK-NEXT:    [[CALL:%.*]] = call contract <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 [[LOAD28]], i64 [[LOAD29]], <16 x float> [[LOAD220]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    store <16 x float> [[CALL]], ptr [[LOAD217]], align 64
+; CHECK-NEXT:    [[CALL225:%.*]] = call contract <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 [[LOAD73]], i64 [[LOAD76]], <16 x float> [[CALL]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    store <16 x float> [[CALL225]], ptr [[LOAD217]], align 64
+; CHECK-NEXT:    [[CALL230:%.*]] = call contract <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 [[LOAD121]], i64 [[LOAD124]], <16 x float> [[CALL225]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[CALL235:%.*]] = call contract <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 [[LOAD169]], i64 [[ALLOCA2_SROA_30_31_INSERT_INSERT]], <16 x float> [[CALL230]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    store <16 x float> [[CALL235]], ptr [[LOAD217]], align 64
+; CHECK-NEXT:    ret void
+;
+bb:
+  %alloca = alloca %struct.eggs, align 1, addrspace(5)
+  %alloca1 = alloca %struct.pluto, align 32, addrspace(5)
+  %alloca2 = alloca %struct.pluto, align 32, addrspace(5)
+  %alloca3 = alloca %struct.snork, align 1, addrspace(5)
+  %alloca4 = alloca %struct.quux, align 8, addrspace(5)
+  %addrspacecast = addrspacecast ptr addrspace(5) %alloca to ptr
+  %addrspacecast5 = addrspacecast ptr addrspace(5) %alloca1 to ptr
+  %addrspacecast6 = addrspacecast ptr addrspace(5) %alloca2 to ptr
+  call void @llvm.lifetime.start.p5(i64 32, ptr addrspace(5) %alloca1)
+  call void @llvm.memset.p5.i64(ptr addrspace(5) align 32 %alloca1, i8 0, i64 32, i1 false)
+  call void @llvm.lifetime.start.p5(i64 32, ptr addrspace(5) %alloca2)
+  call void @llvm.memset.p5.i64(ptr addrspace(5) align 32 %alloca2, i8 0, i64 32, i1 false)
+  call void @llvm.lifetime.start.p5(i64 1, ptr addrspace(5) %alloca3)
+  store ptr %addrspacecast5, ptr addrspace(5) %alloca4, align 8
+  %getelementptr = getelementptr inbounds %struct.quux, ptr addrspace(5) %alloca4, i64 0, i32 1
+  %load = load ptr, ptr %arg, align 8
+  store ptr %load, ptr addrspace(5) %getelementptr, align 8
+  %getelementptr7 = getelementptr inbounds %struct.quux, ptr addrspace(5) %alloca4, i64 0, i32 2
+  %getelementptr8 = getelementptr inbounds %struct.bar, ptr %arg, i64 0, i32 1
+  %load9 = load ptr, ptr %getelementptr8, align 8
+  store ptr %load9, ptr addrspace(5) %getelementptr7, align 8
+  %getelementptr10 = getelementptr inbounds %struct.quux, ptr addrspace(5) %alloca4, i64 0, i32 3
+  store ptr %addrspacecast, ptr addrspace(5) %getelementptr10, align 8
+  %getelementptr11 = getelementptr inbounds %struct.quux, ptr addrspace(5) %alloca4, i64 0, i32 4
+  store ptr %addrspacecast6, ptr addrspace(5) %getelementptr11, align 8
+  %getelementptr12 = getelementptr inbounds %struct.quux, ptr addrspace(5) %alloca4, i64 0, i32 5
+  %getelementptr13 = getelementptr inbounds %struct.bar, ptr %arg, i64 0, i32 2
+  %load14 = load ptr, ptr %getelementptr13, align 8
+  store ptr %load14, ptr addrspace(5) %getelementptr12, align 8
+  %getelementptr15 = getelementptr inbounds %struct.quux, ptr addrspace(5) %alloca4, i64 0, i32 6
+  %getelementptr16 = getelementptr inbounds %struct.bar, ptr %arg, i64 0, i32 3
+  %load17 = load ptr, ptr %getelementptr16, align 8
+  store ptr %load17, ptr addrspace(5) %getelementptr15, align 8
+  %getelementptr18 = getelementptr inbounds %struct.quux, ptr addrspace(5) %alloca4, i64 0, i32 7
+  %getelementptr19 = getelementptr inbounds %struct.bar, ptr %arg, i64 0, i32 4
+  %load20 = load ptr, ptr %getelementptr19, align 8
+  store ptr %load20, ptr addrspace(5) %getelementptr18, align 8
+  %load21 = load ptr, ptr addrspace(5) %alloca4, align 8
+  %getelementptr22 = getelementptr inbounds i8, ptr addrspace(5) %alloca4, i32 8
+  %load23 = load ptr, ptr addrspace(5) %getelementptr22, align 8
+  %getelementptr24 = getelementptr inbounds i8, ptr addrspace(5) %alloca4, i32 32
+  %load25 = load ptr, ptr addrspace(5) %getelementptr24, align 8
+  %getelementptr26 = getelementptr inbounds i8, ptr addrspace(5) %alloca4, i32 40
+  %load27 = load ptr, ptr addrspace(5) %getelementptr26, align 8
+  %load28 = load i8, ptr %load23, align 1
+  store i8 %load28, ptr %load21, align 1
+  %load29 = load i8, ptr %load27, align 1
+  store i8 %load29, ptr %load25, align 1
+  %getelementptr30 = getelementptr inbounds i8, ptr %load23, i64 1
+  %load31 = load i8, ptr %getelementptr30, align 1
+  %getelementptr32 = getelementptr inbounds i8, ptr %load21, i64 1
+  store i8 %load31, ptr %getelementptr32, align 1
+  %getelementptr33 = getelementptr inbounds i8, ptr %load27, i64 1
+  %load34 = load i8, ptr %getelementptr33, align 1
+  %getelementptr35 = getelementptr inbounds i8, ptr %load25, i64 1
+  store i8 %load34, ptr %getelementptr35, align 1
+  %getelementptr36 = getelementptr inbounds i8, ptr %load23, i64 2
+  %load37 = load i8, ptr %getelementptr36, align 1
+  %getelementptr38 = getelementptr inbounds i8, ptr %load21, i64 2
+  store i8 %load37, ptr %getelementptr38, align 1
+  %getelementptr39 = getelementptr inbounds i8, ptr %load27, i64 2
+  %load40 = load i8, ptr %getelementptr39, align 1
+  %getelementptr41 = getelementptr inbounds i8, ptr %load25, i64 2
+  store i8 %load40, ptr %getelementptr41, align 1
+  %getelementptr42 = getelementptr inbounds i8, ptr %load23, i64 3
+  %load43 = load i8, ptr %getelementptr42, align 1
+  %getelementptr44 = getelementptr inbounds i8, ptr %load21, i64 3
+  store i8 %load43, ptr %getelementptr44, align 1
+  %getelementptr45 = getelementptr inbounds i8, ptr %load27, i64 3
+  %load46 = load i8, ptr %getelementptr45, align 1
+  %getelementptr47 = getelementptr inbounds i8, ptr %load25, i64 3
+  store i8 %load46, ptr %getelementptr47, align 1
+  %getelementptr48 = getelementptr inbounds i8, ptr %load23, i64 4
+  %load49 = load i8, ptr %getelementptr48, align 1
+  %getelementptr50 = getelementptr inbounds i8, ptr %load21, i64 4
+  store i8 %load49, ptr %getelementptr50, align 1
+  %getelementptr51 = getelementptr inbounds i8, ptr %load27, i64 4
+  %load52 = load i8, ptr %getelementptr51, align 1
+  %getelementptr53 = getelementptr inbounds i8, ptr %load25, i64 4
+  store i8 %load52, ptr %getelementptr53, align 1
+  %getelementptr54 = getelementptr inbounds i8, ptr %load23, i64 5
+  %load55 = load i8, ptr %getelementptr54, align 1
+  %getelementptr56 = getelementptr inbounds i8, ptr %load21, i64 5
+  store i8 %load55, ptr %getelementptr56, align 1
+  %getelementptr57 = getelementptr inbounds i8, ptr %load27, i64 5
+  %load58 = load i8, ptr %getelementptr57, align 1
+  %getelementptr59 = getelementptr inbounds i8, ptr %load25, i64 5
+  store i8 %load58, ptr %getelementptr59, align 1
+  %getelementptr60 = getelementptr inbounds i8, ptr %load23, i64 6
+  %load61 = load i8, ptr %getelementptr60, align 1
+  %getelementptr62 = getelementptr inbounds i8, ptr %load21, i64 6
+  store i8 %load61, ptr %getelementptr62, align 1
+  %getelementptr63 = getelementptr inbounds i8, ptr %load27, i64 6
+  %load64 = load i8, ptr %getelementptr63, align 1
+  %getelementptr65 = getelementptr inbounds i8, ptr %load25, i64 6
+  store i8 %load64, ptr %getelementptr65, align 1
+  %getelementptr66 = getelementptr inbounds i8, ptr %load23, i64 7
+  %load67 = load i8, ptr %getelementptr66, align 1
+  %getelementptr68 = getelementptr inbounds i8, ptr %load21, i64 7
+  store i8 %load67, ptr %getelementptr68, align 1
+  %getelementptr69 = getelementptr inbounds i8, ptr %load27, i64 7
+  %load70 = load i8, ptr %getelementptr69, align 1
+  %getelementptr71 = getelementptr inbounds i8, ptr %load25, i64 7
+  store i8 %load70, ptr %getelementptr71, align 1
+  %getelementptr72 = getelementptr inbounds i8, ptr %load23, i64 8
+  %load73 = load i8, ptr %getelementptr72, align 1
+  %getelementptr74 = getelementptr inbounds i8, ptr %load21, i64 8
+  store i8 %load73, ptr %getelementptr74, align 1
+  %getelementptr75 = getelementptr inbounds i8, ptr %load27, i64 8
+  %load76 = load i8, ptr %getelementptr75, align 1
+  %getelementptr77 = getelementptr inbounds i8, ptr %load25, i64 8
+  store i8 %load76, ptr %getelementptr77, align 1
+  %getelementptr78 = getelementptr inbounds i8, ptr %load23, i64 9
+  %load79 = load i8, ptr %getelementptr78, align 1
+  %getelementptr80 = getelementptr inbounds i8, ptr %load21, i64 9
+  store i8 %load79, ptr %getelementptr80, align 1
+  %getelementptr81 = getelementptr inbounds i8, ptr %load27, i64 9
+  %load82 = load i8, ptr %getelementptr81, align 1
+  %getelementptr83 = getelementptr inbounds i8, ptr %load25, i64 9
+  store i8 %load82, ptr %getelementptr83, align 1
+  %getelementptr84 = getelementptr inbounds i8, ptr %load23, i64 10
+  %load85 = load i8, ptr %getelementptr84, align 1
+  %getelementptr86 = getelementptr inbounds i8, ptr %load21, i64 10
+  store i8 %load85, ptr %getelementptr86, align 1
+  %getelementptr87 = getelementptr inbounds i8, ptr %load27, i64 10
+  %load88 = load i8, ptr %getelementptr87, align 1
+  %getelementptr89 = getelementptr inbounds i8, ptr %load25, i64 10
+  store i8 %load88, ptr %getelementptr89, align 1
+  %getelementptr90 = getelementptr inbounds i8, ptr %load23, i64 11
+  %load91 = load i8, ptr %getelementptr90, align 1
+  %getelementptr92 = getelementptr inbounds i8, ptr %load21, i64 11
+  store i8 %load91, ptr %getelementptr92, align 1
+  %getelementptr93 = getelementptr inbounds i8, ptr %load27, i64 11
+  %load94 = load i8, ptr %getelementptr93, align 1
+  %getelementptr95 = getelementptr inbounds i8, ptr %load25, i64 11
+  store i8 %load94, ptr %getelementptr95, align 1
+  %getelementptr96 = getelementptr inbounds i8, ptr %load23, i64 12
+  %load97 = load i8, ptr %getelementptr96, align 1
+  %getelementptr98 = getelementptr inbounds i8, ptr %load21, i64 12
+  store i8 %load97, ptr %getelementptr98, align 1
+  %getelementptr99 = getelementptr inbounds i8, ptr %load27, i64 12
+  %load100 = load i8, ptr %getelementptr99, align 1
+  %getelementptr101 = getelementptr inbounds i8, ptr %load25, i64 12
+  store i8 %load100, ptr %getelementptr101, align 1
+  %getelementptr102 = getelementptr inbounds i8, ptr %load23, i64 13
+  %load103 = load i8, ptr %getelementptr102, align 1
+  %getelementptr104 = getelementptr inbounds i8, ptr %load21, i64 13
+  store i8 %load103, ptr %getelementptr104, align 1
+  %getelementptr105 = getelementptr inbounds i8, ptr %load27, i64 13
+  %load106 = load i8, ptr %getelementptr105, align 1
+  %getelementptr107 = getelementptr inbounds i8, ptr %load25, i64 13
+  store i8 %load106, ptr %getelementptr107, align 1
+  %getelementptr108 = getelementptr inbounds i8, ptr %load23, i64 14
+  %load109 = load i8, ptr %getelementptr108, align 1
+  %getelementptr110 = getelementptr inbounds i8, ptr %load21, i64 14
+  store i8 %load109, ptr %getelementptr110, align 1
+  %getelementptr111 = getelementptr inbounds i8, ptr %load27, i64 14
+  %load112 = load i8, ptr %getelementptr111, align 1
+  %getelementptr113 = getelementptr inbounds i8, ptr %load25, i64 14
+  store i8 %load112, ptr %getelementptr113, align 1
+  %getelementptr114 = getelementptr inbounds i8, ptr %load23, i64 15
+  %load115 = load i8, ptr %getelementptr114, align 1
+  %getelementptr116 = getelementptr inbounds i8, ptr %load21, i64 15
+  store i8 %load115, ptr %getelementptr116, align 1
+  %getelementptr117 = getelementptr inbounds i8, ptr %load27, i64 15
+  %load118 = load i8, ptr %getelementptr117, align 1
+  %getelementptr119 = getelementptr inbounds i8, ptr %load25, i64 15
+  store i8 %load118, ptr %getelementptr119, align 1
+  %getelementptr120 = getelementptr inbounds i8, ptr %load23, i64 16
+  %load121 = load i8, ptr %getelementptr120, align 1
+  %getelementptr122 = getelementptr inbounds i8, ptr %load21, i64 16
+  store i8 %load121, ptr %getelementptr122, align 1
+  %getelementptr123 = getelementptr inbounds i8, ptr %load27, i64 16
+  %load124 = load i8, ptr %getelementptr123, align 1
+  %getelementptr125 = getelementptr inbounds i8, ptr %load25, i64 16
+  store i8 %load124, ptr %getelementptr125, align 1
+  %getelementptr126 = getelementptr inbounds i8, ptr %load23, i64 17
+  %load127 = load i8, ptr %getelementptr126, align 1
+  %getelementptr128 = getelementptr inbounds i8, ptr %load21, i64 17
+  store i8 %load127, ptr %getelementptr128, align 1
+  %getelementptr129 = getelementptr inbounds i8, ptr %load27, i64 17
+  %load130 = load i8, ptr %getelementptr129, align 1
+  %getelementptr131 = getelementptr inbounds i8, ptr %load25, i64 17
+  store i8 %load130, ptr %getelementptr131, align 1
+  %getelementptr132 = getelementptr inbounds i8, ptr %load23, i64 18
+  %load133 = load i8, ptr %getelementptr132, align 1
+  %getelementptr134 = getelementptr inbounds i8, ptr %load21, i64 18
+  store i8 %load133, ptr %getelementptr134, align 1
+  %getelementptr135 = getelementptr inbounds i8, ptr %load27, i64 18
+  %load136 = load i8, ptr %getelementptr135, align 1
+  %getelementptr137 = getelementptr inbounds i8, ptr %load25, i64 18
+  store i8 %load136, ptr %getelementptr137, align 1
+  %getelementptr138 = getelementptr inbounds i8, ptr %load23, i64 19
+  %load139 = load i8, ptr %getelementptr138, align 1
+  %getelementptr140 = getelementptr inbounds i8, ptr %load21, i64 19
+  store i8 %load139, ptr %getelementptr140, align 1
+  %getelementptr141 = getelementptr inbounds i8, ptr %load27, i64 19
+  %load142 = load i8, ptr %getelementptr141, align 1
+  %getelementptr143 = getelementptr inbounds i8, ptr %load25, i64 19
+  store i8 %load142, ptr %getelementptr143, align 1
+  %getelementptr144 = getelementptr inbounds i8, ptr %load23, i64 20
+  %load145 = load i8, ptr %getelementptr144, align 1
+  %getelementptr146 = getelementptr inbounds i8, ptr %load21, i64 20
+  store i8 %load145, ptr %getelementptr146, align 1
+  %getelementptr147 = getelementptr inbounds i8, ptr %load27, i64 20
+  %load148 = load i8, ptr %getelementptr147, align 1
+  %getelementptr149 = getelementptr inbounds i8, ptr %load25, i64 20
+  store i8 %load148, ptr %getelementptr149, align 1
+  %getelementptr150 = getelementptr inbounds i8, ptr %load23, i64 21
+  %load151 = load i8, ptr %getelementptr150, align 1
+  %getelementptr152 = getelementptr inbounds i8, ptr %load21, i64 21
+  store i8 %load151, ptr %getelementptr152, align 1
+  %getelementptr153 = getelementptr inbounds i8, ptr %load27, i64 21
+  %load154 = load i8, ptr %getelementptr153, align 1
+  %getelementptr155 = getelementptr inbounds i8, ptr %load25, i64 21
+  store i8 %load154, ptr %getelementptr155, align 1
+  %getelementptr156 = getelementptr inbounds i8, ptr %load23, i64 22
+  %load157 = load i8, ptr %getelementptr156, align 1
+  %getelementptr158 = getelementptr inbounds i8, ptr %load21, i64 22
+  store i8 %load157, ptr %getelementptr158, align 1
+  %getelementptr159 = getelementptr inbounds i8, ptr %load27, i64 22
+  %load160 = load i8, ptr %getelementptr159, align 1
+  %getelementptr161 = getelementptr inbounds i8, ptr %load25, i64 22
+  store i8 %load160, ptr %getelementptr161, align 1
+  %getelementptr162 = getelementptr inbounds i8, ptr %load23, i64 23
+  %load163 = load i8, ptr %getelementptr162, align 1
+  %getelementptr164 = getelementptr inbounds i8, ptr %load21, i64 23
+  store i8 %load163, ptr %getelementptr164, align 1
+  %getelementptr165 = getelementptr inbounds i8, ptr %load27, i64 23
+  %load166 = load i8, ptr %getelementptr165, align 1
+  %getelementptr167 = getelementptr inbounds i8, ptr %load25, i64 23
+  store i8 %load166, ptr %getelementptr167, align 1
+  %getelementptr168 = getelementptr inbounds i8, ptr %load23, i64 24
+  %load169 = load i8, ptr %getelementptr168, align 1
+  %getelementptr170 = getelementptr inbounds i8, ptr %load21, i64 24
+  store i8 %load169, ptr %getelementptr170, align 1
+  %getelementptr171 = getelementptr inbounds i8, ptr %load27, i64 24
+  %load172 = load i8, ptr %getelementptr171, align 1
+  %getelementptr173 = getelementptr inbounds i8, ptr %load25, i64 24
+  store i8 %load172, ptr %getelementptr173, align 1
+  %getelementptr174 = getelementptr inbounds i8, ptr %load23, i64 25
+  %load175 = load i8, ptr %getelementptr174, align 1
+  %getelementptr176 = getelementptr inbounds i8, ptr %load21, i64 25
+  store i8 %load175, ptr %getelementptr176, align 1
+  %getelementptr177 = getelementptr inbounds i8, ptr %load27, i64 25
+  %load178 = load i8, ptr %getelementptr177, align 1
+  %getelementptr179 = getelementptr inbounds i8, ptr %load25, i64 25
+  store i8 %load178, ptr %getelementptr179, align 1
+  %getelementptr180 = getelementptr inbounds i8, ptr %load23, i64 26
+  %load181 = load i8, ptr %getelementptr180, align 1
+  %getelementptr182 = getelementptr inbounds i8, ptr %load21, i64 26
+  store i8 %load181, ptr %getelementptr182, align 1
+  %getelementptr183 = getelementptr inbounds i8, ptr %load27, i64 26
+  %load184 = load i8, ptr %getelementptr183, align 1
+  %getelementptr185 = getelementptr inbounds i8, ptr %load25, i64 26
+  store i8 %load184, ptr %getelementptr185, align 1
+  %getelementptr186 = getelementptr inbounds i8, ptr %load23, i64 27
+  %load187 = load i8, ptr %getelementptr186, align 1
+  %getelementptr188 = getelementptr inbounds i8, ptr %load21, i64 27
+  store i8 %load187, ptr %getelementptr188, align 1
+  %getelementptr189 = getelementptr inbounds i8, ptr %load27, i64 27
+  %load190 = load i8, ptr %getelementptr189, align 1
+  %getelementptr191 = getelementptr inbounds i8, ptr %load25, i64 27
+  store i8 %load190, ptr %getelementptr191, align 1
+  %getelementptr192 = getelementptr inbounds i8, ptr %load23, i64 28
+  %load193 = load i8, ptr %getelementptr192, align 1
+  %getelementptr194 = getelementptr inbounds i8, ptr %load21, i64 28
+  store i8 %load193, ptr %getelementptr194, align 1
+  %getelementptr195 = getelementptr inbounds i8, ptr %load27, i64 28
+  %load196 = load i8, ptr %getelementptr195, align 1
+  %getelementptr197 = getelementptr inbounds i8, ptr %load25, i64 28
+  store i8 %load196, ptr %getelementptr197, align 1
+  %getelementptr198 = getelementptr inbounds i8, ptr %load23, i64 29
+  %load199 = load i8, ptr %getelementptr198, align 1
+  %getelementptr200 = getelementptr inbounds i8, ptr %load21, i64 29
+  store i8 %load199, ptr %getelementptr200, align 1
+  %getelementptr201 = getelementptr inbounds i8, ptr %load27, i64 29
+  %load202 = load i8, ptr %getelementptr201, align 1
+  %getelementptr203 = getelementptr inbounds i8, ptr %load25, i64 29
+  store i8 %load202, ptr %getelementptr203, align 1
+  %getelementptr204 = getelementptr inbounds i8, ptr %load23, i64 30
+  %load205 = load i8, ptr %getelementptr204, align 1
+  %getelementptr206 = getelementptr inbounds i8, ptr %load21, i64 30
+  store i8 %load205, ptr %getelementptr206, align 1
+  %getelementptr207 = getelementptr inbounds i8, ptr %load27, i64 30
+  %load208 = load i8, ptr %getelementptr207, align 1
+  %getelementptr209 = getelementptr inbounds i8, ptr %load25, i64 30
+  store i8 %load208, ptr %getelementptr209, align 1
+  %getelementptr210 = getelementptr inbounds i8, ptr %load23, i64 31
+  %load211 = load i8, ptr %getelementptr210, align 1
+  %getelementptr212 = getelementptr inbounds i8, ptr %load21, i64 31
+  store i8 %load211, ptr %getelementptr212, align 1
+  %getelementptr213 = getelementptr inbounds i8, ptr %load27, i64 31
+  %load214 = load i8, ptr %getelementptr213, align 1
+  %getelementptr215 = getelementptr inbounds i8, ptr %load25, i64 31
+  store i8 %load214, ptr %getelementptr215, align 1
+  call void @llvm.lifetime.end.p5(i64 1, ptr addrspace(5) %alloca3)
+  %getelementptr216 = getelementptr inbounds %struct.bar, ptr %arg, i64 0, i32 5
+  %load217 = load ptr, ptr %getelementptr216, align 8
+  %load218 = load i64, ptr addrspace(5) %alloca1, align 8
+  %load219 = load i64, ptr addrspace(5) %alloca2, align 8
+  %load220 = load <16 x float>, ptr %load217, align 64
+  %call = call contract <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %load218, i64 %load219, <16 x float> %load220, i32 0, i32 0, i32 0)
+  store <16 x float> %call, ptr %load217, align 64
+  %getelementptr221 = getelementptr inbounds i8, ptr addrspace(5) %alloca1, i64 8
+  %getelementptr222 = getelementptr inbounds i8, ptr addrspace(5) %alloca2, i64 8
+  %load223 = load i64, ptr addrspace(5) %getelementptr221, align 8
+  %load224 = load i64, ptr addrspace(5) %getelementptr222, align 8
+  %call225 = call contract <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %load223, i64 %load224, <16 x float> %call, i32 0, i32 0, i32 0)
+  store <16 x float> %call225, ptr %load217, align 64
+  %getelementptr226 = getelementptr inbounds i8, ptr addrspace(5) %alloca1, i64 16
+  %getelementptr227 = getelementptr inbounds i8, ptr addrspace(5) %alloca2, i64 16
+  %load228 = load i64, ptr addrspace(5) %getelementptr226, align 8
+  %load229 = load i64, ptr addrspace(5) %getelementptr227, align 8
+  %call230 = call contract <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %load228, i64 %load229, <16 x float> %call225, i32 0, i32 0, i32 0)
+  store <16 x float> %call230, ptr %load217, align 64
+  %getelementptr231 = getelementptr inbounds i8, ptr addrspace(5) %alloca1, i64 24
+  %getelementptr232 = getelementptr inbounds i8, ptr addrspace(5) %alloca2, i64 24
+  %load233 = load i64, ptr addrspace(5) %getelementptr231, align 8
+  %load234 = load i64, ptr addrspace(5) %getelementptr232, align 8
+  %call235 = call contract <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %load233, i64 %load234, <16 x float> %call230, i32 0, i32 0, i32 0)
+  store <16 x float> %call235, ptr %load217, align 64
+  call void @llvm.lifetime.end.p5(i64 32, ptr addrspace(5) %alloca2)
+  call void @llvm.lifetime.end.p5(i64 32, ptr addrspace(5) %alloca1)
+  ret void
+}
+
+; Function Attrs: convergent nocallback nofree nosync nounwind willreturn memory(none)
+declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64, i64, <16 x float>, i32 immarg, i32 immarg, i32 immarg) #0
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.lifetime.end.p5(i64 immarg, ptr addrspace(5) captures(none)) #1
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.lifetime.start.p5(i64 immarg, ptr addrspace(5) captures(none)) #1
+
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write)
+declare void @llvm.memset.p5.i64(ptr addrspace(5) writeonly captures(none), i8, i64, i1 immarg) #2
+
+attributes #0 = { convergent nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
+attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: write) }
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll b/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll
index 37a261cab7563..e8bd640aa5409 100644
--- a/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll
@@ -7,23 +7,25 @@ define void @memcpy_fixed_align(ptr addrspace(5)  %dst, ptr addrspace(1) %src) {
 ; MUBUF-LABEL: memcpy_fixed_align:
 ; MUBUF:       ; %bb.0:
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MUBUF-NEXT:    global_load_dwordx2 v[11:12], v[1:2], off offset:32
 ; MUBUF-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
 ; MUBUF-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off offset:16
+; MUBUF-NEXT:    global_load_dwordx4 v[11:14], v[1:2], off offset:24
 ; MUBUF-NEXT:    s_lshr_b32 s4, s32, 6
 ; MUBUF-NEXT:    s_waitcnt vmcnt(2)
-; MUBUF-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:32
-; MUBUF-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:36
-; MUBUF-NEXT:    s_waitcnt vmcnt(3)
 ; MUBUF-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:12
 ; MUBUF-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:8
 ; MUBUF-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:4
 ; MUBUF-NEXT:    buffer_store_dword v3, off, s[0:3], s32
-; MUBUF-NEXT:    s_waitcnt vmcnt(6)
+; MUBUF-NEXT:    s_waitcnt vmcnt(5)
 ; MUBUF-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:28
 ; MUBUF-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:24
 ; MUBUF-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:20
 ; MUBUF-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:16
+; MUBUF-NEXT:    s_waitcnt vmcnt(8)
+; MUBUF-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:36
+; MUBUF-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:32
+; MUBUF-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:28
+; MUBUF-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:24
 ; MUBUF-NEXT:    ;;#ASMSTART
 ; MUBUF-NEXT:    ; use s4
 ; MUBUF-NEXT:    ;;#ASMEND
@@ -35,14 +37,14 @@ define void @memcpy_fixed_align(ptr addrspace(5)  %dst, ptr addrspace(1) %src) {
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; FLATSCR-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
 ; FLATSCR-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off offset:16
-; FLATSCR-NEXT:    global_load_dwordx2 v[11:12], v[1:2], off offset:32
+; FLATSCR-NEXT:    global_load_dwordx4 v[11:14], v[1:2], off offset:24
 ; FLATSCR-NEXT:    s_mov_b32 s0, s32
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(2)
 ; FLATSCR-NEXT:    scratch_store_dwordx4 off, v[3:6], s32
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(2)
 ; FLATSCR-NEXT:    scratch_store_dwordx4 off, v[7:10], s32 offset:16
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(2)
-; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[11:12], s32 offset:32
+; FLATSCR-NEXT:    scratch_store_dwordx4 off, v[11:14], s32 offset:24
 ; FLATSCR-NEXT:    ;;#ASMSTART
 ; FLATSCR-NEXT:    ; use s0
 ; FLATSCR-NEXT:    ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
index 0003366f3a3ea..5b7c36559a366 100644
--- a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
@@ -12,21 +12,19 @@ define amdgpu_kernel void @memcpy_p0_p0_minsize(ptr %dest, ptr readonly %src) #0
 ; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
 ; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v12, s3
-; CHECK-NEXT:    v_mov_b32_e32 v11, s2
-; CHECK-NEXT:    flat_load_ubyte v13, v[11:12] offset:46
-; CHECK-NEXT:    flat_load_ushort v14, v[11:12] offset:44
-; CHECK-NEXT:    flat_load_dwordx3 v[8:10], v[11:12] offset:32
-; CHECK-NEXT:    flat_load_dwordx4 v[0:3], v[11:12] offset:16
-; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[11:12]
-; CHECK-NEXT:    v_mov_b32_e32 v12, s1
-; CHECK-NEXT:    v_mov_b32_e32 v11, s0
+; CHECK-NEXT:    v_mov_b32_e32 v9, s3
+; CHECK-NEXT:    v_mov_b32_e32 v8, s2
+; CHECK-NEXT:    flat_load_dwordx2 v[10:11], v[8:9] offset:32
+; CHECK-NEXT:    flat_load_dwordx2 v[12:13], v[8:9] offset:39
+; CHECK-NEXT:    flat_load_dwordx4 v[0:3], v[8:9]
+; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[8:9] offset:16
+; CHECK-NEXT:    v_mov_b32_e32 v9, s1
+; CHECK-NEXT:    v_mov_b32_e32 v8, s0
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[11:12], v13 offset:46
-; CHECK-NEXT:    flat_store_short v[11:12], v14 offset:44
-; CHECK-NEXT:    flat_store_dwordx3 v[11:12], v[8:10] offset:32
-; CHECK-NEXT:    flat_store_dwordx4 v[11:12], v[0:3] offset:16
-; CHECK-NEXT:    flat_store_dwordx4 v[11:12], v[4:7]
+; CHECK-NEXT:    flat_store_dwordx2 v[8:9], v[10:11] offset:32
+; CHECK-NEXT:    flat_store_dwordx2 v[8:9], v[12:13] offset:39
+; CHECK-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
+; CHECK-NEXT:    flat_store_dwordx4 v[8:9], v[4:7] offset:16
 ; CHECK-NEXT:    s_endpgm
 entry:
   tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false)
@@ -173,33 +171,33 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %
 ; CHECK-NEXT:    v_mov_b32_e32 v26, s0
 ; CHECK-NEXT:    buffer_load_dword v3, v26, s[20:23], 0 offen offset:124
 ; CHECK-NEXT:    buffer_load_dword v2, v26, s[20:23], 0 offen offset:120
-; CHECK-NEXT:    buffer_load_dword v5, v26, s[20:23], 0 offen offset:100
-; CHECK-NEXT:    buffer_load_dword v7, v26, s[20:23], 0 offen offset:108
 ; CHECK-NEXT:    buffer_load_dword v1, v26, s[20:23], 0 offen offset:116
 ; CHECK-NEXT:    buffer_load_dword v0, v26, s[20:23], 0 offen offset:112
+; CHECK-NEXT:    buffer_load_dword v7, v26, s[20:23], 0 offen offset:108
 ; CHECK-NEXT:    buffer_load_dword v6, v26, s[20:23], 0 offen offset:104
+; CHECK-NEXT:    buffer_load_dword v5, v26, s[20:23], 0 offen offset:100
 ; CHECK-NEXT:    buffer_load_dword v4, v26, s[20:23], 0 offen offset:96
 ; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-NEXT:    buffer_load_dword v8, v26, s[20:23], 0 offen offset:32
-; CHECK-NEXT:    buffer_load_dword v9, v26, s[20:23], 0 offen offset:36
-; CHECK-NEXT:    buffer_load_dword v10, v26, s[20:23], 0 offen offset:40
-; CHECK-NEXT:    buffer_load_dword v11, v26, s[20:23], 0 offen offset:44
-; CHECK-NEXT:    buffer_load_dword v12, v26, s[20:23], 0 offen offset:48
-; CHECK-NEXT:    buffer_load_dword v13, v26, s[20:23], 0 offen offset:52
-; CHECK-NEXT:    buffer_load_dword v14, v26, s[20:23], 0 offen offset:56
-; CHECK-NEXT:    buffer_load_dword v15, v26, s[20:23], 0 offen offset:60
-; CHECK-NEXT:    buffer_load_dword v17, v26, s[20:23], 0 offen offset:68
-; CHECK-NEXT:    buffer_load_dword v19, v26, s[20:23], 0 offen offset:76
-; CHECK-NEXT:    buffer_load_dword v21, v26, s[20:23], 0 offen offset:84
-; CHECK-NEXT:    buffer_load_dword v23, v26, s[20:23], 0 offen offset:92
-; CHECK-NEXT:    buffer_load_dword v22, v26, s[20:23], 0 offen offset:88
-; CHECK-NEXT:    buffer_load_dword v20, v26, s[20:23], 0 offen offset:80
-; CHECK-NEXT:    buffer_load_dword v18, v26, s[20:23], 0 offen offset:72
-; CHECK-NEXT:    buffer_load_dword v16, v26, s[20:23], 0 offen offset:64
+; CHECK-NEXT:    buffer_load_dword v11, v26, s[20:23], 0 offen offset:92
+; CHECK-NEXT:    buffer_load_dword v10, v26, s[20:23], 0 offen offset:88
+; CHECK-NEXT:    buffer_load_dword v9, v26, s[20:23], 0 offen offset:84
+; CHECK-NEXT:    buffer_load_dword v8, v26, s[20:23], 0 offen offset:80
+; CHECK-NEXT:    buffer_load_dword v15, v26, s[20:23], 0 offen offset:76
+; CHECK-NEXT:    buffer_load_dword v14, v26, s[20:23], 0 offen offset:72
+; CHECK-NEXT:    buffer_load_dword v13, v26, s[20:23], 0 offen offset:68
+; CHECK-NEXT:    buffer_load_dword v12, v26, s[20:23], 0 offen offset:64
+; CHECK-NEXT:    buffer_load_dword v16, v26, s[20:23], 0 offen offset:32
+; CHECK-NEXT:    buffer_load_dword v17, v26, s[20:23], 0 offen offset:36
+; CHECK-NEXT:    buffer_load_dword v18, v26, s[20:23], 0 offen offset:40
+; CHECK-NEXT:    buffer_load_dword v19, v26, s[20:23], 0 offen offset:44
+; CHECK-NEXT:    buffer_load_dword v20, v26, s[20:23], 0 offen offset:48
+; CHECK-NEXT:    buffer_load_dword v21, v26, s[20:23], 0 offen offset:52
+; CHECK-NEXT:    buffer_load_dword v22, v26, s[20:23], 0 offen offset:56
+; CHECK-NEXT:    buffer_load_dword v23, v26, s[20:23], 0 offen offset:60
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v25, s1
 ; CHECK-NEXT:    v_mov_b32_e32 v24, s0
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
+; CHECK-NEXT:    s_waitcnt vmcnt(20)
 ; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[0:3] offset:112
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[4:7] offset:96
@@ -213,10 +211,10 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %
 ; CHECK-NEXT:    buffer_load_dword v7, v26, s[20:23], 0 offen offset:28
 ; CHECK-NEXT:    buffer_load_dword v3, v26, s[20:23], 0 offen offset:12
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[20:23] offset:80
-; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[16:19] offset:64
-; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[12:15] offset:48
-; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[8:11] offset:32
+; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[8:11] offset:80
+; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[12:15] offset:64
+; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[20:23] offset:48
+; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[16:19] offset:32
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[4:7] offset:16
 ; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[0:3]
@@ -281,8 +279,8 @@ define amdgpu_kernel void @memcpy_p0_p3_minsize(ptr %generic) #0 {
 ; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[8:11] offset:32
 ; CHECK-NEXT:    ds_read2_b64 v[0:3], v16 offset0:8 offset1:9
 ; CHECK-NEXT:    ds_read2_b64 v[4:7], v16 offset0:10 offset1:11
-; CHECK-NEXT:    ds_read2_b64 v[8:11], v16 offset0:12 offset1:13
-; CHECK-NEXT:    ds_read2_b64 v[16:19], v16 offset0:14 offset1:15
+; CHECK-NEXT:    ds_read_b128 v[8:11], v16 offset:96
+; CHECK-NEXT:    ds_read_b128 v[16:19], v16 offset:112
 ; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[12:15] offset:48
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[0:3] offset:64
@@ -302,21 +300,19 @@ define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 {
 ; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
 ; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    v_mov_b32_e32 v12, s3
-; CHECK-NEXT:    v_mov_b32_e32 v11, s2
-; CHECK-NEXT:    flat_load_ubyte v13, v[11:12] offset:46
-; CHECK-NEXT:    flat_load_ushort v14, v[11:12] offset:44
-; CHECK-NEXT:    flat_load_dwordx3 v[8:10], v[11:12] offset:32
-; CHECK-NEXT:    flat_load_dwordx4 v[0:3], v[11:12] offset:16
-; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[11:12]
-; CHECK-NEXT:    v_mov_b32_e32 v12, s1
-; CHECK-NEXT:    v_mov_b32_e32 v11, s0
+; CHECK-NEXT:    v_mov_b32_e32 v9, s3
+; CHECK-NEXT:    v_mov_b32_e32 v8, s2
+; CHECK-NEXT:    flat_load_dwordx2 v[10:11], v[8:9] offset:32
+; CHECK-NEXT:    flat_load_dwordx2 v[12:13], v[8:9] offset:39
+; CHECK-NEXT:    flat_load_dwordx4 v[0:3], v[8:9]
+; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[8:9] offset:16
+; CHECK-NEXT:    v_mov_b32_e32 v9, s1
+; CHECK-NEXT:    v_mov_b32_e32 v8, s0
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[11:12], v13 offset:46
-; CHECK-NEXT:    flat_store_short v[11:12], v14 offset:44
-; CHECK-NEXT:    flat_store_dwordx3 v[11:12], v[8:10] offset:32
-; CHECK-NEXT:    flat_store_dwordx4 v[11:12], v[0:3] offset:16
-; CHECK-NEXT:    flat_store_dwordx4 v[11:12], v[4:7]
+; CHECK-NEXT:    flat_store_dwordx2 v[8:9], v[10:11] offset:32
+; CHECK-NEXT:    flat_store_dwordx2 v[8:9], v[12:13] offset:39
+; CHECK-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
+; CHECK-NEXT:    flat_store_dwordx4 v[8:9], v[4:7] offset:16
 ; CHECK-NEXT:    s_endpgm
 entry:
   tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false)
@@ -463,33 +459,33 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %
 ; CHECK-NEXT:    v_mov_b32_e32 v26, s0
 ; CHECK-NEXT:    buffer_load_dword v3, v26, s[20:23], 0 offen offset:124
 ; CHECK-NEXT:    buffer_load_dword v2, v26, s[20:23], 0 offen offset:120
-; CHECK-NEXT:    buffer_load_dword v5, v26, s[20:23], 0 offen offset:100
-; CHECK-NEXT:    buffer_load_dword v7, v26, s[20:23], 0 offen offset:108
 ; CHECK-NEXT:    buffer_load_dword v1, v26, s[20:23], 0 offen offset:116
 ; CHECK-NEXT:    buffer_load_dword v0, v26, s[20:23], 0 offen offset:112
+; CHECK-NEXT:    buffer_load_dword v7, v26, s[20:23], 0 offen offset:108
 ; CHECK-NEXT:    buffer_load_dword v6, v26, s[20:23], 0 offen offset:104
+; CHECK-NEXT:    buffer_load_dword v5, v26, s[20:23], 0 offen offset:100
 ; CHECK-NEXT:    buffer_load_dword v4, v26, s[20:23], 0 offen offset:96
 ; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-NEXT:    buffer_load_dword v8, v26, s[20:23], 0 offen offset:32
-; CHECK-NEXT:    buffer_load_dword v9, v26, s[20:23], 0 offen offset:36
-; CHECK-NEXT:    buffer_load_dword v10, v26, s[20:23], 0 offen offset:40
-; CHECK-NEXT:    buffer_load_dword v11, v26, s[20:23], 0 offen offset:44
-; CHECK-NEXT:    buffer_load_dword v12, v26, s[20:23], 0 offen offset:48
-; CHECK-NEXT:    buffer_load_dword v13, v26, s[20:23], 0 offen offset:52
-; CHECK-NEXT:    buffer_load_dword v14, v26, s[20:23], 0 offen offset:56
-; CHECK-NEXT:    buffer_load_dword v15, v26, s[20:23], 0 offen offset:60
-; CHECK-NEXT:    buffer_load_dword v17, v26, s[20:23], 0 offen offset:68
-; CHECK-NEXT:    buffer_load_dword v19, v26, s[20:23], 0 offen offset:76
-; CHECK-NEXT:    buffer_load_dword v21, v26, s[20:23], 0 offen offset:84
-; CHECK-NEXT:    buffer_load_dword v23, v26, s[20:23], 0 offen offset:92
-; CHECK-NEXT:    buffer_load_dword v22, v26, s[20:23], 0 offen offset:88
-; CHECK-NEXT:    buffer_load_dword v20, v26, s[20:23], 0 offen offset:80
-; CHECK-NEXT:    buffer_load_dword v18, v26, s[20:23], 0 offen offset:72
-; CHECK-NEXT:    buffer_load_dword v16, v26, s[20:23], 0 offen offset:64
+; CHECK-NEXT:    buffer_load_dword v11, v26, s[20:23], 0 offen offset:92
+; CHECK-NEXT:    buffer_load_dword v10, v26, s[20:23], 0 offen offset:88
+; CHECK-NEXT:    buffer_load_dword v9, v26, s[20:23], 0 offen offset:84
+; CHECK-NEXT:    buffer_load_dword v8, v26, s[20:23], 0 offen offset:80
+; CHECK-NEXT:    buffer_load_dword v15, v26, s[20:23], 0 offen offset:76
+; CHECK-NEXT:    buffer_load_dword v14, v26, s[20:23], 0 offen offset:72
+; CHECK-NEXT:    buffer_load_dword v13, v26, s[20:23], 0 offen offset:68
+; CHECK-NEXT:    buffer_load_dword v12, v26, s[20:23], 0 offen offset:64
+; CHECK-NEXT:    buffer_load_dword v16, v26, s[20:23], 0 offen offset:32
+; CHECK-NEXT:    buffer_load_dword v17, v26, s[20:23], 0 offen offset:36
+; CHECK-NEXT:    buffer_load_dword v18, v26, s[20:23], 0 offen offset:40
+; CHECK-NEXT:    buffer_load_dword v19, v26, s[20:23], 0 offen offset:44
+; CHECK-NEXT:    buffer_load_dword v20, v26, s[20:23], 0 offen offset:48
+; CHECK-NEXT:    buffer_load_dword v21, v26, s[20:23], 0 offen offset:52
+; CHECK-NEXT:    buffer_load_dword v22, v26, s[20:23], 0 offen offset:56
+; CHECK-NEXT:    buffer_load_dword v23, v26, s[20:23], 0 offen offset:60
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v25, s1
 ; CHECK-NEXT:    v_mov_b32_e32 v24, s0
-; CHECK-NEXT:    s_waitcnt vmcnt(18)
+; CHECK-NEXT:    s_waitcnt vmcnt(20)
 ; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[0:3] offset:112
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[4:7] offset:96
@@ -503,10 +499,10 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %
 ; CHECK-NEXT:    buffer_load_dword v7, v26, s[20:23], 0 offen offset:28
 ; CHECK-NEXT:    buffer_load_dword v3, v26, s[20:23], 0 offen offset:12
 ; CHECK-NEXT:    s_nop 0
-; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[20:23] offset:80
-; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[16:19] offset:64
-; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[12:15] offset:48
-; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[8:11] offset:32
+; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[8:11] offset:80
+; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[12:15] offset:64
+; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[20:23] offset:48
+; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[16:19] offset:32
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[4:7] offset:16
 ; CHECK-NEXT:    flat_store_dwordx4 v[24:25], v[0:3]
@@ -571,8 +567,8 @@ define amdgpu_kernel void @memcpy_p0_p3_optsize(ptr %generic) #1 {
 ; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[8:11] offset:32
 ; CHECK-NEXT:    ds_read2_b64 v[0:3], v16 offset0:8 offset1:9
 ; CHECK-NEXT:    ds_read2_b64 v[4:7], v16 offset0:10 offset1:11
-; CHECK-NEXT:    ds_read2_b64 v[8:11], v16 offset0:12 offset1:13
-; CHECK-NEXT:    ds_read2_b64 v[16:19], v16 offset0:14 offset1:15
+; CHECK-NEXT:    ds_read_b128 v[8:11], v16 offset:96
+; CHECK-NEXT:    ds_read_b128 v[16:19], v16 offset:112
 ; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[12:15] offset:48
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    flat_store_dwordx4 v[20:21], v[0:3] offset:64
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll b/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll
index b43ccc551ca95..048610184368d 100644
--- a/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll
@@ -27,19 +27,16 @@ define void @memcpy_p0_p0_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p0_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:30
-; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:28
-; CHECK-NEXT:    flat_load_dwordx3 v[6:8], v[2:3] offset:16
-; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(3)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(3)
-; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[6:8] offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(3)
-; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT:    s_clause 0x2
+; CHECK-NEXT:    flat_load_dwordx2 v[8:9], v[2:3] offset:23
+; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[2:3]
+; CHECK-NEXT:    flat_load_dwordx2 v[2:3], v[2:3] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[8:9] offset:23
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(2)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(2)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[2:3] offset:16
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -83,19 +80,16 @@ define void @memcpy_p0_p0_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p0_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    flat_load_ubyte v9, v[2:3] offset:30
-; CHECK-NEXT:    flat_load_ushort v10, v[2:3] offset:28
-; CHECK-NEXT:    flat_load_dwordx3 v[6:8], v[2:3] offset:16
-; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(3)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
-; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(3)
-; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[6:8] offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(3)
-; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT:    s_clause 0x2
+; CHECK-NEXT:    flat_load_dwordx2 v[8:9], v[2:3] offset:23
+; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[2:3]
+; CHECK-NEXT:    flat_load_dwordx2 v[2:3], v[2:3] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[8:9] offset:23
+; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(2)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(2)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[2:3] offset:16
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -239,19 +233,16 @@ define void @memcpy_p0_p1_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p1_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:30
-; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:28
-; CHECK-NEXT:    global_load_dwordx3 v[6:8], v[2:3], off offset:16
-; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
+; CHECK-NEXT:    s_clause 0x2
+; CHECK-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off offset:23
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[8:9] offset:23
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[6:8] offset:16
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[2:3] offset:16
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -295,19 +286,16 @@ define void @memcpy_p0_p1_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p1_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    global_load_ubyte v9, v[2:3], off offset:30
-; CHECK-NEXT:    global_load_ushort v10, v[2:3], off offset:28
-; CHECK-NEXT:    global_load_dwordx3 v[6:8], v[2:3], off offset:16
-; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
+; CHECK-NEXT:    s_clause 0x2
+; CHECK-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off offset:23
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[8:9] offset:23
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[6:8] offset:16
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[2:3] offset:16
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -437,7 +425,7 @@ define void @memcpy_p0_p3_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p3_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read2_b64 v[2:5], v2 offset1:1
+; CHECK-NEXT:    ds_read_b128 v[2:5], v2
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
@@ -451,19 +439,15 @@ define void @memcpy_p0_p3_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p3_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_b32 v8, v2 offset:24
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:30
-; CHECK-NEXT:    ds_read_u16 v10, v2 offset:28
-; CHECK-NEXT:    ds_read_b64 v[6:7], v2 offset:16
-; CHECK-NEXT:    ds_read2_b64 v[2:5], v2 offset1:1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[6:8] offset:16
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT:    ds_read_b64 v[7:8], v2 offset:23
+; CHECK-NEXT:    ds_read_b128 v[3:6], v2
+; CHECK-NEXT:    ds_read_b64 v[9:10], v2 offset:16
+; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[7:8] offset:23
+; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[9:10] offset:16
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -475,8 +459,8 @@ define void @memcpy_p0_p3_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p3_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read2_b64 v[3:6], v2 offset0:2 offset1:3
-; CHECK-NEXT:    ds_read2_b64 v[7:10], v2 offset1:1
+; CHECK-NEXT:    ds_read_b128 v[3:6], v2 offset:16
+; CHECK-NEXT:    ds_read_b128 v[7:10], v2
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
 ; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6] offset:16
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
@@ -492,7 +476,7 @@ define void @memcpy_p0_p3_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p3_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read2_b64 v[2:5], v2 offset1:1
+; CHECK-NEXT:    ds_read_b128 v[2:5], v2
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
@@ -506,19 +490,15 @@ define void @memcpy_p0_p3_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p3_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_b32 v8, v2 offset:24
-; CHECK-NEXT:    ds_read_u8 v9, v2 offset:30
-; CHECK-NEXT:    ds_read_u16 v10, v2 offset:28
-; CHECK-NEXT:    ds_read_b64 v[6:7], v2 offset:16
-; CHECK-NEXT:    ds_read2_b64 v[2:5], v2 offset1:1
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    flat_store_short v[0:1], v10 offset:28
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[6:8] offset:16
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT:    ds_read_b64 v[7:8], v2 offset:23
+; CHECK-NEXT:    ds_read_b128 v[3:6], v2
+; CHECK-NEXT:    ds_read_b64 v[9:10], v2 offset:16
+; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[7:8] offset:23
+; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[9:10] offset:16
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -530,8 +510,8 @@ define void @memcpy_p0_p3_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p3_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read2_b64 v[3:6], v2 offset0:2 offset1:3
-; CHECK-NEXT:    ds_read2_b64 v[7:10], v2 offset1:1
+; CHECK-NEXT:    ds_read_b128 v[3:6], v2 offset:16
+; CHECK-NEXT:    ds_read_b128 v[7:10], v2
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
 ; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6] offset:16
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
@@ -643,12 +623,9 @@ define void @memcpy_p0_p4_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p4_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
-; CHECK-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off offset:8
+; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[2:3] offset:8
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -660,24 +637,16 @@ define void @memcpy_p0_p4_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p4_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
-; CHECK-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[4:5] offset:8
-; CHECK-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[4:5] offset:16
-; CHECK-NEXT:    global_load_dword v4, v[2:3], off offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_dword v[0:1], v4 offset:24
-; CHECK-NEXT:    global_load_ushort v4, v[2:3], off offset:28
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:8
+; CHECK-NEXT:    s_waitcnt vmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[8:9]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:28
-; CHECK-NEXT:    global_load_ubyte v2, v[2:3], off offset:30
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7] offset:8
+; CHECK-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off offset:23
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:30
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[2:3] offset:23
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -689,18 +658,13 @@ define void @memcpy_p0_p4_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p4_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
-; CHECK-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[4:5] offset:8
-; CHECK-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[4:5] offset:16
-; CHECK-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off offset:24
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[2:3] offset:24
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[8:11] offset:16
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -712,12 +676,9 @@ define void @memcpy_p0_p4_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p4_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
-; CHECK-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off offset:8
+; CHECK-NEXT:    global_load_dwordx4 v[2:5], v[2:3], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[2:3] offset:8
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -729,24 +690,16 @@ define void @memcpy_p0_p4_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p4_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
-; CHECK-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[4:5] offset:8
-; CHECK-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[4:5] offset:16
-; CHECK-NEXT:    global_load_dword v4, v[2:3], off offset:24
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_dword v[0:1], v4 offset:24
-; CHECK-NEXT:    global_load_ushort v4, v[2:3], off offset:28
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    global_load_dwordx2 v[8:9], v[2:3], off
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off offset:8
+; CHECK-NEXT:    s_waitcnt vmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[8:9]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_short v[0:1], v4 offset:28
-; CHECK-NEXT:    global_load_ubyte v2, v[2:3], off offset:30
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7] offset:8
+; CHECK-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off offset:23
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:30
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[2:3] offset:23
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -758,18 +711,13 @@ define void @memcpy_p0_p4_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p4_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[4:5]
-; CHECK-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off offset:8
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[4:5] offset:8
-; CHECK-NEXT:    global_load_dwordx2 v[4:5], v[2:3], off offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[4:5] offset:16
-; CHECK-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off offset:24
+; CHECK-NEXT:    s_clause 0x1
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT:    global_load_dwordx4 v[8:11], v[2:3], off offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(1)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[2:3] offset:24
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[8:11] offset:16
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -895,22 +843,20 @@ define void @memcpy_p0_p5_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p5_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x8
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    s_clause 0x7
 ; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:28
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:30
-; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[7:9] offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    s_waitcnt vmcnt(2)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[7:8] offset:23
 ; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[9:10] offset:16
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -964,22 +910,20 @@ define void @memcpy_p0_p5_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p0_p5_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x8
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    s_clause 0x7
 ; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28
 ; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    flat_store_short v[0:1], v11 offset:28
-; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:30
-; CHECK-NEXT:    flat_store_dwordx3 v[0:1], v[7:9] offset:16
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    s_waitcnt vmcnt(2)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[7:8] offset:23
 ; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx2 v[0:1], v[9:10] offset:16
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
@@ -1161,15 +1105,15 @@ define void @memcpy_p1_p0_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_clause 0x2
-; CHECK-NEXT:    flat_load_dwordx2 v[6:7], v[2:3] offset:23
-; CHECK-NEXT:    flat_load_dwordx2 v[8:9], v[2:3] offset:16
-; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
+; CHECK-NEXT:    flat_load_dwordx2 v[8:9], v[2:3] offset:23
+; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[2:3]
+; CHECK-NEXT:    flat_load_dwordx2 v[2:3], v[2:3] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off offset:23
+; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[8:9], off offset:23
 ; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[8:9], off offset:16
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off offset:16
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -1211,15 +1155,15 @@ define void @memcpy_p1_p0_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_clause 0x2
-; CHECK-NEXT:    flat_load_dwordx2 v[6:7], v[2:3] offset:23
-; CHECK-NEXT:    flat_load_dwordx2 v[8:9], v[2:3] offset:16
-; CHECK-NEXT:    flat_load_dwordx4 v[2:5], v[2:3]
+; CHECK-NEXT:    flat_load_dwordx2 v[8:9], v[2:3] offset:23
+; CHECK-NEXT:    flat_load_dwordx4 v[4:7], v[2:3]
+; CHECK-NEXT:    flat_load_dwordx2 v[2:3], v[2:3] offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off offset:23
+; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[8:9], off offset:23
 ; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[8:9], off offset:16
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off offset:16
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -1929,18 +1873,18 @@ define void @memcpy_p1_p5_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:20
 ; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:23
 ; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT:    s_waitcnt vmcnt(6)
+; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[7:8], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[7:8], off offset:16
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[9:10], off offset:23
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -1994,18 +1938,18 @@ define void @memcpy_p1_p5_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_clause 0x7
-; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:20
 ; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:23
 ; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:27
-; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT:    s_waitcnt vmcnt(6)
+; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[7:8], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[7:8], off offset:16
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[9:10], off offset:23
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -3267,19 +3211,16 @@ define void @memcpy_p5_p0_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p0_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:30
-; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:28
-; CHECK-NEXT:    flat_load_dwordx3 v[5:7], v[1:2] offset:16
+; CHECK-NEXT:    s_clause 0x2
+; CHECK-NEXT:    flat_load_dwordx2 v[5:6], v[1:2] offset:23
+; CHECK-NEXT:    flat_load_dwordx2 v[7:8], v[1:2] offset:16
 ; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:23
 ; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
@@ -3334,19 +3275,16 @@ define void @memcpy_p5_p0_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p0_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    flat_load_ubyte v8, v[1:2] offset:30
-; CHECK-NEXT:    flat_load_ushort v9, v[1:2] offset:28
-; CHECK-NEXT:    flat_load_dwordx3 v[5:7], v[1:2] offset:16
+; CHECK-NEXT:    s_clause 0x2
+; CHECK-NEXT:    flat_load_dwordx2 v[5:6], v[1:2] offset:23
+; CHECK-NEXT:    flat_load_dwordx2 v[7:8], v[1:2] offset:16
 ; CHECK-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
-; CHECK-NEXT:    s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_byte v8, v0, s[0:3], 0 offen offset:30
 ; CHECK-NEXT:    s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:23
 ; CHECK-NEXT:    s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
@@ -3525,24 +3463,21 @@ define void @memcpy_p5_p1_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p1_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    global_load_dwordx3 v[5:7], v[1:2], off offset:16
-; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:30
-; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    s_clause 0x2
+; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT:    global_load_dwordx2 v[7:8], v[1:2], off offset:16
+; CHECK-NEXT:    global_load_dwordx2 v[1:2], v[1:2], off offset:23
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:23
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -3592,24 +3527,21 @@ define void @memcpy_p5_p1_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p1_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    global_load_dwordx3 v[5:7], v[1:2], off offset:16
-; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:30
-; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    s_clause 0x2
+; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT:    global_load_dwordx2 v[7:8], v[1:2], off offset:16
+; CHECK-NEXT:    global_load_dwordx2 v[1:2], v[1:2], off offset:23
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:23
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -3783,25 +3715,20 @@ define void @memcpy_p5_p3_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p3_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_b32 v8, v1 offset:24
-; CHECK-NEXT:    ds_read_u16 v9, v1 offset:28
-; CHECK-NEXT:    ds_read_u8 v10, v1 offset:30
 ; CHECK-NEXT:    ds_read2_b64 v[2:5], v1 offset1:1
 ; CHECK-NEXT:    ds_read_b64 v[6:7], v1 offset:16
-; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    ds_read_b64 v[8:9], v1 offset:23
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
 ; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
 ; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
 ; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:23
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -3850,25 +3777,20 @@ define void @memcpy_p5_p3_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p3_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read_b32 v8, v1 offset:24
-; CHECK-NEXT:    ds_read_u16 v9, v1 offset:28
-; CHECK-NEXT:    ds_read_u8 v10, v1 offset:30
 ; CHECK-NEXT:    ds_read2_b64 v[2:5], v1 offset1:1
 ; CHECK-NEXT:    ds_read_b64 v[6:7], v1 offset:16
-; CHECK-NEXT:    s_waitcnt lgkmcnt(4)
-; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
-; CHECK-NEXT:    buffer_store_short v9, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    ds_read_b64 v[8:9], v1 offset:23
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
-; CHECK-NEXT:    buffer_store_byte v10, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
 ; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
 ; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
 ; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:23
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -4037,24 +3959,21 @@ define void @memcpy_p5_p4_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p4_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    global_load_dwordx3 v[5:7], v[1:2], off offset:16
-; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:30
-; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    s_clause 0x2
+; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT:    global_load_dwordx2 v[7:8], v[1:2], off offset:16
+; CHECK-NEXT:    global_load_dwordx2 v[1:2], v[1:2], off offset:23
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:23
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -4104,24 +4023,21 @@ define void @memcpy_p5_p4_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p4_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x3
-; CHECK-NEXT:    global_load_dwordx3 v[5:7], v[1:2], off offset:16
-; CHECK-NEXT:    global_load_ushort v8, v[1:2], off offset:28
-; CHECK-NEXT:    global_load_ubyte v9, v[1:2], off offset:30
-; CHECK-NEXT:    global_load_dwordx4 v[1:4], v[1:2], off
-; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    s_clause 0x2
+; CHECK-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT:    global_load_dwordx2 v[7:8], v[1:2], off offset:16
+; CHECK-NEXT:    global_load_dwordx2 v[1:2], v[1:2], off offset:23
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_short v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:23
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -4302,34 +4218,31 @@ define void @memcpy_p5_p5_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p5_sz31_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x8
-; CHECK-NEXT:    buffer_load_ushort v2, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:27
 ; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:20
 ; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_dword v9, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_short v2, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:23
 ; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:27
 ; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:20
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -4398,34 +4311,31 @@ define void @memcpy_p5_p5_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
 ; CHECK-LABEL: memcpy_p5_p5_sz31_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_clause 0x8
-; CHECK-NEXT:    buffer_load_ushort v2, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    s_clause 0x7
+; CHECK-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 offen offset:23
+; CHECK-NEXT:    buffer_load_dword v3, v1, s[0:3], 0 offen offset:27
 ; CHECK-NEXT:    buffer_load_dword v4, v1, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    buffer_load_dword v5, v1, s[0:3], 0 offen offset:20
 ; CHECK-NEXT:    buffer_load_dword v6, v1, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    buffer_load_dword v7, v1, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    buffer_load_dword v8, v1, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_dword v9, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    s_waitcnt vmcnt(8)
-; CHECK-NEXT:    buffer_store_short v2, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_waitcnt vmcnt(7)
-; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:23
 ; CHECK-NEXT:    s_waitcnt vmcnt(6)
-; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:27
 ; CHECK-NEXT:    s_waitcnt vmcnt(5)
-; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:16
 ; CHECK-NEXT:    s_waitcnt vmcnt(4)
-; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:20
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
-; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:8
 ; CHECK-NEXT:    s_waitcnt vmcnt(2)
-; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:12
 ; CHECK-NEXT:    s_waitcnt vmcnt(1)
-; CHECK-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false)
diff --git a/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll b/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll
index f08ea27040fb5..01b7f40f6256f 100644
--- a/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll
+++ b/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll
@@ -471,7 +471,7 @@ define void @memmove_p0_p3_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p3_sz16_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read2_b64 v[2:5], v2 offset1:1
+; CHECK-NEXT:    ds_read_b128 v[2:5], v2
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
@@ -489,7 +489,7 @@ define void @memmove_p0_p3_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
 ; CHECK-NEXT:    ds_read_u8 v9, v2 offset:30
 ; CHECK-NEXT:    ds_read_u16 v10, v2 offset:28
 ; CHECK-NEXT:    ds_read_b64 v[6:7], v2 offset:16
-; CHECK-NEXT:    ds_read2_b64 v[2:5], v2 offset1:1
+; CHECK-NEXT:    ds_read_b128 v[2:5], v2
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
 ; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
@@ -509,8 +509,8 @@ define void @memmove_p0_p3_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p3_sz32_align_1_1:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read2_b64 v[3:6], v2 offset0:2 offset1:3
-; CHECK-NEXT:    ds_read2_b64 v[7:10], v2 offset1:1
+; CHECK-NEXT:    ds_read_b128 v[3:6], v2 offset:16
+; CHECK-NEXT:    ds_read_b128 v[7:10], v2
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
 ; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6] offset:16
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
@@ -526,7 +526,7 @@ define void @memmove_p0_p3_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p3_sz16_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read2_b64 v[2:5], v2 offset1:1
+; CHECK-NEXT:    ds_read_b128 v[2:5], v2
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
@@ -544,7 +544,7 @@ define void @memmove_p0_p3_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
 ; CHECK-NEXT:    ds_read_u8 v9, v2 offset:30
 ; CHECK-NEXT:    ds_read_u16 v10, v2 offset:28
 ; CHECK-NEXT:    ds_read_b64 v[6:7], v2 offset:16
-; CHECK-NEXT:    ds_read2_b64 v[2:5], v2 offset1:1
+; CHECK-NEXT:    ds_read_b128 v[2:5], v2
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
 ; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:30
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(3)
@@ -564,8 +564,8 @@ define void @memmove_p0_p3_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
 ; CHECK-LABEL: memmove_p0_p3_sz32_align_2_2:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    ds_read2_b64 v[3:6], v2 offset0:2 offset1:3
-; CHECK-NEXT:    ds_read2_b64 v[7:10], v2 offset1:1
+; CHECK-NEXT:    ds_read_b128 v[3:6], v2 offset:16
+; CHECK-NEXT:    ds_read_b128 v[7:10], v2
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
 ; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6] offset:16
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
@@ -2077,21 +2077,23 @@ define void @memmove_p1_p5_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr add
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_clause 0x8
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    s_waitcnt vmcnt(4)
+; CHECK-NEXT:    global_store_dword v[0:1], v10, off offset:24
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
 ; CHECK-NEXT:    global_store_short v[0:1], v11, off offset:28
-; CHECK-NEXT:    global_store_byte v[0:1], v10, off offset:30
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT:    global_store_byte v[0:1], v9, off offset:30
+; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[7:8], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx3 v[0:1], v[7:9], off offset:16
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off offset:8
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -2143,21 +2145,23 @@ define void @memmove_p1_p5_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr add
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    s_clause 0x8
-; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen
-; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:24
 ; CHECK-NEXT:    buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28
-; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    s_waitcnt vmcnt(4)
+; CHECK-NEXT:    global_store_dword v[0:1], v10, off offset:24
 ; CHECK-NEXT:    s_waitcnt vmcnt(3)
 ; CHECK-NEXT:    global_store_short v[0:1], v11, off offset:28
-; CHECK-NEXT:    global_store_byte v[0:1], v10, off offset:30
-; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT:    global_store_byte v[0:1], v9, off offset:30
+; CHECK-NEXT:    global_store_dwordx2 v[0:1], v[7:8], off
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    global_store_dwordx3 v[0:1], v[7:9], off offset:16
+; CHECK-NEXT:    global_store_dwordx4 v[0:1], v[3:6], off offset:8
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
 entry:
   tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false)

>From 9c0c36e439672c98764d44687de88d15ffd353cc Mon Sep 17 00:00:00 2001
From: Matthew Curtis <macurtis at amd.com>
Date: Fri, 12 Sep 2025 07:14:25 -0500
Subject: [PATCH 2/5] fixup! AMDGPU: Report unaligned scratch access as fast if
 supported by tgt

---
 .../AggressiveInstCombine}/fold-consecutive-loads.ll              | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename llvm/test/{CodeGen/AMDGPU => Transforms/AggressiveInstCombine}/fold-consecutive-loads.ll (100%)

diff --git a/llvm/test/CodeGen/AMDGPU/fold-consecutive-loads.ll b/llvm/test/Transforms/AggressiveInstCombine/fold-consecutive-loads.ll
similarity index 100%
rename from llvm/test/CodeGen/AMDGPU/fold-consecutive-loads.ll
rename to llvm/test/Transforms/AggressiveInstCombine/fold-consecutive-loads.ll

>From d3c536a47725b579ca22236f8bdf41e1573ae290 Mon Sep 17 00:00:00 2001
From: Matthew Curtis <macurtis at amd.com>
Date: Fri, 12 Sep 2025 07:14:25 -0500
Subject: [PATCH 3/5] fixup! AMDGPU: Report unaligned scratch access as fast if
 supported by tgt

---
 .../fold-consecutive-loads.ll                 | 495 +++++-------------
 1 file changed, 138 insertions(+), 357 deletions(-)

diff --git a/llvm/test/Transforms/AggressiveInstCombine/fold-consecutive-loads.ll b/llvm/test/Transforms/AggressiveInstCombine/fold-consecutive-loads.ll
index 610760f788ea8..8fe242598ef0c 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/fold-consecutive-loads.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/fold-consecutive-loads.ll
@@ -51,407 +51,188 @@ bb:
 
 
 ; The following test case reduced from a client kernel
-%struct.eggs = type { i8 }
-%struct.pluto = type { %struct.spam }
-%struct.spam = type { <32 x i8> }
-%struct.snork = type { i8 }
-%struct.quux = type { ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr }
-%struct.bar = type { ptr, ptr, ptr, ptr, ptr, ptr }
-
-define fastcc void @hoge(ptr noundef nonnull readonly align 8 captures(none) dereferenceable(48) %arg) {
-; CHECK-LABEL: define fastcc void @hoge(
-; CHECK-SAME: ptr noundef nonnull readonly align 8 captures(none) dereferenceable(48) [[ARG:%.*]]) #[[ATTR0]] {
+define fastcc <16 x float> @hoge(ptr %arg) {
+; CHECK-LABEL: define fastcc <16 x float> @hoge(
+; CHECK-SAME: ptr [[ARG:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  [[BB:.*:]]
 ; CHECK-NEXT:    [[LOAD:%.*]] = load ptr, ptr [[ARG]], align 8
-; CHECK-NEXT:    [[GETELEMENTPTR13:%.*]] = getelementptr inbounds nuw i8, ptr [[ARG]], i64 16
-; CHECK-NEXT:    [[LOAD14:%.*]] = load ptr, ptr [[GETELEMENTPTR13]], align 8
 ; CHECK-NEXT:    [[LOAD28:%.*]] = load i64, ptr [[LOAD]], align 1
-; CHECK-NEXT:    [[LOAD29:%.*]] = load i64, ptr [[LOAD14]], align 1
-; CHECK-NEXT:    [[GETELEMENTPTR72:%.*]] = getelementptr inbounds nuw i8, ptr [[LOAD]], i64 8
+; CHECK-NEXT:    [[GETELEMENTPTR72:%.*]] = getelementptr i8, ptr [[LOAD]], i64 8
 ; CHECK-NEXT:    [[LOAD73:%.*]] = load i64, ptr [[GETELEMENTPTR72]], align 1
-; CHECK-NEXT:    [[GETELEMENTPTR75:%.*]] = getelementptr inbounds nuw i8, ptr [[LOAD14]], i64 8
-; CHECK-NEXT:    [[LOAD76:%.*]] = load i64, ptr [[GETELEMENTPTR75]], align 1
-; CHECK-NEXT:    [[GETELEMENTPTR120:%.*]] = getelementptr inbounds nuw i8, ptr [[LOAD]], i64 16
+; CHECK-NEXT:    [[GETELEMENTPTR120:%.*]] = getelementptr i8, ptr [[LOAD]], i64 16
 ; CHECK-NEXT:    [[LOAD121:%.*]] = load i64, ptr [[GETELEMENTPTR120]], align 1
-; CHECK-NEXT:    [[GETELEMENTPTR123:%.*]] = getelementptr inbounds nuw i8, ptr [[LOAD14]], i64 16
-; CHECK-NEXT:    [[LOAD124:%.*]] = load i64, ptr [[GETELEMENTPTR123]], align 1
-; CHECK-NEXT:    [[GETELEMENTPTR168:%.*]] = getelementptr inbounds nuw i8, ptr [[LOAD]], i64 24
+; CHECK-NEXT:    [[GETELEMENTPTR168:%.*]] = getelementptr i8, ptr [[LOAD]], i64 24
 ; CHECK-NEXT:    [[LOAD169:%.*]] = load i64, ptr [[GETELEMENTPTR168]], align 1
-; CHECK-NEXT:    [[GETELEMENTPTR171:%.*]] = getelementptr inbounds nuw i8, ptr [[LOAD14]], i64 24
-; CHECK-NEXT:    [[LOAD172:%.*]] = load i32, ptr [[GETELEMENTPTR171]], align 1
-; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[LOAD172]] to i64
-; CHECK-NEXT:    [[GETELEMENTPTR195:%.*]] = getelementptr inbounds nuw i8, ptr [[LOAD14]], i64 28
-; CHECK-NEXT:    [[LOAD196:%.*]] = load i8, ptr [[GETELEMENTPTR195]], align 1
-; CHECK-NEXT:    [[ALLOCA2_SROA_30_28_INSERT_EXT:%.*]] = zext i8 [[LOAD196]] to i64
-; CHECK-NEXT:    [[ALLOCA2_SROA_30_28_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[ALLOCA2_SROA_30_28_INSERT_EXT]], 32
-; CHECK-NEXT:    [[GETELEMENTPTR201:%.*]] = getelementptr inbounds nuw i8, ptr [[LOAD14]], i64 29
-; CHECK-NEXT:    [[LOAD202:%.*]] = load i8, ptr [[GETELEMENTPTR201]], align 1
-; CHECK-NEXT:    [[ALLOCA2_SROA_30_29_INSERT_EXT:%.*]] = zext i8 [[LOAD202]] to i64
-; CHECK-NEXT:    [[ALLOCA2_SROA_30_29_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[ALLOCA2_SROA_30_29_INSERT_EXT]], 40
-; CHECK-NEXT:    [[ALLOCA2_SROA_30_29_INSERT_MASK:%.*]] = or disjoint i64 [[TMP0]], [[ALLOCA2_SROA_30_28_INSERT_SHIFT]]
-; CHECK-NEXT:    [[GETELEMENTPTR207:%.*]] = getelementptr inbounds nuw i8, ptr [[LOAD14]], i64 30
-; CHECK-NEXT:    [[LOAD208:%.*]] = load i8, ptr [[GETELEMENTPTR207]], align 1
-; CHECK-NEXT:    [[ALLOCA2_SROA_30_30_INSERT_EXT:%.*]] = zext i8 [[LOAD208]] to i64
-; CHECK-NEXT:    [[ALLOCA2_SROA_30_30_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[ALLOCA2_SROA_30_30_INSERT_EXT]], 48
-; CHECK-NEXT:    [[GETELEMENTPTR213:%.*]] = getelementptr inbounds nuw i8, ptr [[LOAD14]], i64 31
-; CHECK-NEXT:    [[LOAD214:%.*]] = load i8, ptr [[GETELEMENTPTR213]], align 1
-; CHECK-NEXT:    [[ALLOCA2_SROA_30_31_INSERT_EXT:%.*]] = zext i8 [[LOAD214]] to i64
-; CHECK-NEXT:    [[ALLOCA2_SROA_30_31_INSERT_SHIFT:%.*]] = shl nuw i64 [[ALLOCA2_SROA_30_31_INSERT_EXT]], 56
-; CHECK-NEXT:    [[ALLOCA2_SROA_30_30_INSERT_MASK_MASKED:%.*]] = or i64 [[ALLOCA2_SROA_30_29_INSERT_MASK]], [[ALLOCA2_SROA_30_29_INSERT_SHIFT]]
-; CHECK-NEXT:    [[ALLOCA2_SROA_30_31_INSERT_MASK:%.*]] = or i64 [[ALLOCA2_SROA_30_30_INSERT_MASK_MASKED]], [[ALLOCA2_SROA_30_30_INSERT_SHIFT]]
-; CHECK-NEXT:    [[ALLOCA2_SROA_30_31_INSERT_INSERT:%.*]] = or i64 [[ALLOCA2_SROA_30_31_INSERT_MASK]], [[ALLOCA2_SROA_30_31_INSERT_SHIFT]]
-; CHECK-NEXT:    [[GETELEMENTPTR216:%.*]] = getelementptr inbounds nuw i8, ptr [[ARG]], i64 40
-; CHECK-NEXT:    [[LOAD217:%.*]] = load ptr, ptr [[GETELEMENTPTR216]], align 8
-; CHECK-NEXT:    [[LOAD220:%.*]] = load <16 x float>, ptr [[LOAD217]], align 64
-; CHECK-NEXT:    [[CALL:%.*]] = call contract <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 [[LOAD28]], i64 [[LOAD29]], <16 x float> [[LOAD220]], i32 0, i32 0, i32 0)
-; CHECK-NEXT:    store <16 x float> [[CALL]], ptr [[LOAD217]], align 64
-; CHECK-NEXT:    [[CALL225:%.*]] = call contract <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 [[LOAD73]], i64 [[LOAD76]], <16 x float> [[CALL]], i32 0, i32 0, i32 0)
-; CHECK-NEXT:    store <16 x float> [[CALL225]], ptr [[LOAD217]], align 64
-; CHECK-NEXT:    [[CALL230:%.*]] = call contract <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 [[LOAD121]], i64 [[LOAD124]], <16 x float> [[CALL225]], i32 0, i32 0, i32 0)
-; CHECK-NEXT:    [[CALL235:%.*]] = call contract <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 [[LOAD169]], i64 [[ALLOCA2_SROA_30_31_INSERT_INSERT]], <16 x float> [[CALL230]], i32 0, i32 0, i32 0)
-; CHECK-NEXT:    store <16 x float> [[CALL235]], ptr [[LOAD217]], align 64
-; CHECK-NEXT:    ret void
+; CHECK-NEXT:    [[CALL:%.*]] = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 [[LOAD28]], i64 0, <16 x float> zeroinitializer, i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[CALL225:%.*]] = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 [[LOAD73]], i64 0, <16 x float> [[CALL]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[CALL230:%.*]] = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 [[LOAD121]], i64 0, <16 x float> [[CALL225]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[CALL235:%.*]] = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 [[LOAD169]], i64 0, <16 x float> [[CALL230]], i32 0, i32 0, i32 0)
+; CHECK-NEXT:    ret <16 x float> [[CALL235]]
 ;
 bb:
-  %alloca = alloca %struct.eggs, align 1, addrspace(5)
-  %alloca1 = alloca %struct.pluto, align 32, addrspace(5)
-  %alloca2 = alloca %struct.pluto, align 32, addrspace(5)
-  %alloca3 = alloca %struct.snork, align 1, addrspace(5)
-  %alloca4 = alloca %struct.quux, align 8, addrspace(5)
-  %addrspacecast = addrspacecast ptr addrspace(5) %alloca to ptr
-  %addrspacecast5 = addrspacecast ptr addrspace(5) %alloca1 to ptr
-  %addrspacecast6 = addrspacecast ptr addrspace(5) %alloca2 to ptr
-  call void @llvm.lifetime.start.p5(i64 32, ptr addrspace(5) %alloca1)
-  call void @llvm.memset.p5.i64(ptr addrspace(5) align 32 %alloca1, i8 0, i64 32, i1 false)
-  call void @llvm.lifetime.start.p5(i64 32, ptr addrspace(5) %alloca2)
-  call void @llvm.memset.p5.i64(ptr addrspace(5) align 32 %alloca2, i8 0, i64 32, i1 false)
-  call void @llvm.lifetime.start.p5(i64 1, ptr addrspace(5) %alloca3)
-  store ptr %addrspacecast5, ptr addrspace(5) %alloca4, align 8
-  %getelementptr = getelementptr inbounds %struct.quux, ptr addrspace(5) %alloca4, i64 0, i32 1
   %load = load ptr, ptr %arg, align 8
-  store ptr %load, ptr addrspace(5) %getelementptr, align 8
-  %getelementptr7 = getelementptr inbounds %struct.quux, ptr addrspace(5) %alloca4, i64 0, i32 2
-  %getelementptr8 = getelementptr inbounds %struct.bar, ptr %arg, i64 0, i32 1
-  %load9 = load ptr, ptr %getelementptr8, align 8
-  store ptr %load9, ptr addrspace(5) %getelementptr7, align 8
-  %getelementptr10 = getelementptr inbounds %struct.quux, ptr addrspace(5) %alloca4, i64 0, i32 3
-  store ptr %addrspacecast, ptr addrspace(5) %getelementptr10, align 8
-  %getelementptr11 = getelementptr inbounds %struct.quux, ptr addrspace(5) %alloca4, i64 0, i32 4
-  store ptr %addrspacecast6, ptr addrspace(5) %getelementptr11, align 8
-  %getelementptr12 = getelementptr inbounds %struct.quux, ptr addrspace(5) %alloca4, i64 0, i32 5
-  %getelementptr13 = getelementptr inbounds %struct.bar, ptr %arg, i64 0, i32 2
-  %load14 = load ptr, ptr %getelementptr13, align 8
-  store ptr %load14, ptr addrspace(5) %getelementptr12, align 8
-  %getelementptr15 = getelementptr inbounds %struct.quux, ptr addrspace(5) %alloca4, i64 0, i32 6
-  %getelementptr16 = getelementptr inbounds %struct.bar, ptr %arg, i64 0, i32 3
-  %load17 = load ptr, ptr %getelementptr16, align 8
-  store ptr %load17, ptr addrspace(5) %getelementptr15, align 8
-  %getelementptr18 = getelementptr inbounds %struct.quux, ptr addrspace(5) %alloca4, i64 0, i32 7
-  %getelementptr19 = getelementptr inbounds %struct.bar, ptr %arg, i64 0, i32 4
-  %load20 = load ptr, ptr %getelementptr19, align 8
-  store ptr %load20, ptr addrspace(5) %getelementptr18, align 8
-  %load21 = load ptr, ptr addrspace(5) %alloca4, align 8
-  %getelementptr22 = getelementptr inbounds i8, ptr addrspace(5) %alloca4, i32 8
-  %load23 = load ptr, ptr addrspace(5) %getelementptr22, align 8
-  %getelementptr24 = getelementptr inbounds i8, ptr addrspace(5) %alloca4, i32 32
-  %load25 = load ptr, ptr addrspace(5) %getelementptr24, align 8
-  %getelementptr26 = getelementptr inbounds i8, ptr addrspace(5) %alloca4, i32 40
-  %load27 = load ptr, ptr addrspace(5) %getelementptr26, align 8
-  %load28 = load i8, ptr %load23, align 1
-  store i8 %load28, ptr %load21, align 1
-  %load29 = load i8, ptr %load27, align 1
-  store i8 %load29, ptr %load25, align 1
-  %getelementptr30 = getelementptr inbounds i8, ptr %load23, i64 1
+  %load28 = load i8, ptr %load, align 1
+  %getelementptr30 = getelementptr i8, ptr %load, i64 1
   %load31 = load i8, ptr %getelementptr30, align 1
-  %getelementptr32 = getelementptr inbounds i8, ptr %load21, i64 1
-  store i8 %load31, ptr %getelementptr32, align 1
-  %getelementptr33 = getelementptr inbounds i8, ptr %load27, i64 1
-  %load34 = load i8, ptr %getelementptr33, align 1
-  %getelementptr35 = getelementptr inbounds i8, ptr %load25, i64 1
-  store i8 %load34, ptr %getelementptr35, align 1
-  %getelementptr36 = getelementptr inbounds i8, ptr %load23, i64 2
+  %getelementptr36 = getelementptr i8, ptr %load, i64 2
   %load37 = load i8, ptr %getelementptr36, align 1
-  %getelementptr38 = getelementptr inbounds i8, ptr %load21, i64 2
-  store i8 %load37, ptr %getelementptr38, align 1
-  %getelementptr39 = getelementptr inbounds i8, ptr %load27, i64 2
-  %load40 = load i8, ptr %getelementptr39, align 1
-  %getelementptr41 = getelementptr inbounds i8, ptr %load25, i64 2
-  store i8 %load40, ptr %getelementptr41, align 1
-  %getelementptr42 = getelementptr inbounds i8, ptr %load23, i64 3
+  %getelementptr42 = getelementptr i8, ptr %load, i64 3
   %load43 = load i8, ptr %getelementptr42, align 1
-  %getelementptr44 = getelementptr inbounds i8, ptr %load21, i64 3
-  store i8 %load43, ptr %getelementptr44, align 1
-  %getelementptr45 = getelementptr inbounds i8, ptr %load27, i64 3
-  %load46 = load i8, ptr %getelementptr45, align 1
-  %getelementptr47 = getelementptr inbounds i8, ptr %load25, i64 3
-  store i8 %load46, ptr %getelementptr47, align 1
-  %getelementptr48 = getelementptr inbounds i8, ptr %load23, i64 4
+  %getelementptr48 = getelementptr i8, ptr %load, i64 4
   %load49 = load i8, ptr %getelementptr48, align 1
-  %getelementptr50 = getelementptr inbounds i8, ptr %load21, i64 4
-  store i8 %load49, ptr %getelementptr50, align 1
-  %getelementptr51 = getelementptr inbounds i8, ptr %load27, i64 4
-  %load52 = load i8, ptr %getelementptr51, align 1
-  %getelementptr53 = getelementptr inbounds i8, ptr %load25, i64 4
-  store i8 %load52, ptr %getelementptr53, align 1
-  %getelementptr54 = getelementptr inbounds i8, ptr %load23, i64 5
+  %getelementptr54 = getelementptr i8, ptr %load, i64 5
   %load55 = load i8, ptr %getelementptr54, align 1
-  %getelementptr56 = getelementptr inbounds i8, ptr %load21, i64 5
-  store i8 %load55, ptr %getelementptr56, align 1
-  %getelementptr57 = getelementptr inbounds i8, ptr %load27, i64 5
-  %load58 = load i8, ptr %getelementptr57, align 1
-  %getelementptr59 = getelementptr inbounds i8, ptr %load25, i64 5
-  store i8 %load58, ptr %getelementptr59, align 1
-  %getelementptr60 = getelementptr inbounds i8, ptr %load23, i64 6
+  %getelementptr60 = getelementptr i8, ptr %load, i64 6
   %load61 = load i8, ptr %getelementptr60, align 1
-  %getelementptr62 = getelementptr inbounds i8, ptr %load21, i64 6
-  store i8 %load61, ptr %getelementptr62, align 1
-  %getelementptr63 = getelementptr inbounds i8, ptr %load27, i64 6
-  %load64 = load i8, ptr %getelementptr63, align 1
-  %getelementptr65 = getelementptr inbounds i8, ptr %load25, i64 6
-  store i8 %load64, ptr %getelementptr65, align 1
-  %getelementptr66 = getelementptr inbounds i8, ptr %load23, i64 7
+  %getelementptr66 = getelementptr i8, ptr %load, i64 7
   %load67 = load i8, ptr %getelementptr66, align 1
-  %getelementptr68 = getelementptr inbounds i8, ptr %load21, i64 7
-  store i8 %load67, ptr %getelementptr68, align 1
-  %getelementptr69 = getelementptr inbounds i8, ptr %load27, i64 7
-  %load70 = load i8, ptr %getelementptr69, align 1
-  %getelementptr71 = getelementptr inbounds i8, ptr %load25, i64 7
-  store i8 %load70, ptr %getelementptr71, align 1
-  %getelementptr72 = getelementptr inbounds i8, ptr %load23, i64 8
+  %getelementptr72 = getelementptr i8, ptr %load, i64 8
   %load73 = load i8, ptr %getelementptr72, align 1
-  %getelementptr74 = getelementptr inbounds i8, ptr %load21, i64 8
-  store i8 %load73, ptr %getelementptr74, align 1
-  %getelementptr75 = getelementptr inbounds i8, ptr %load27, i64 8
-  %load76 = load i8, ptr %getelementptr75, align 1
-  %getelementptr77 = getelementptr inbounds i8, ptr %load25, i64 8
-  store i8 %load76, ptr %getelementptr77, align 1
-  %getelementptr78 = getelementptr inbounds i8, ptr %load23, i64 9
+  %getelementptr78 = getelementptr i8, ptr %load, i64 9
   %load79 = load i8, ptr %getelementptr78, align 1
-  %getelementptr80 = getelementptr inbounds i8, ptr %load21, i64 9
-  store i8 %load79, ptr %getelementptr80, align 1
-  %getelementptr81 = getelementptr inbounds i8, ptr %load27, i64 9
-  %load82 = load i8, ptr %getelementptr81, align 1
-  %getelementptr83 = getelementptr inbounds i8, ptr %load25, i64 9
-  store i8 %load82, ptr %getelementptr83, align 1
-  %getelementptr84 = getelementptr inbounds i8, ptr %load23, i64 10
+  %getelementptr84 = getelementptr i8, ptr %load, i64 10
   %load85 = load i8, ptr %getelementptr84, align 1
-  %getelementptr86 = getelementptr inbounds i8, ptr %load21, i64 10
-  store i8 %load85, ptr %getelementptr86, align 1
-  %getelementptr87 = getelementptr inbounds i8, ptr %load27, i64 10
-  %load88 = load i8, ptr %getelementptr87, align 1
-  %getelementptr89 = getelementptr inbounds i8, ptr %load25, i64 10
-  store i8 %load88, ptr %getelementptr89, align 1
-  %getelementptr90 = getelementptr inbounds i8, ptr %load23, i64 11
+  %getelementptr90 = getelementptr i8, ptr %load, i64 11
   %load91 = load i8, ptr %getelementptr90, align 1
-  %getelementptr92 = getelementptr inbounds i8, ptr %load21, i64 11
-  store i8 %load91, ptr %getelementptr92, align 1
-  %getelementptr93 = getelementptr inbounds i8, ptr %load27, i64 11
-  %load94 = load i8, ptr %getelementptr93, align 1
-  %getelementptr95 = getelementptr inbounds i8, ptr %load25, i64 11
-  store i8 %load94, ptr %getelementptr95, align 1
-  %getelementptr96 = getelementptr inbounds i8, ptr %load23, i64 12
+  %getelementptr96 = getelementptr i8, ptr %load, i64 12
   %load97 = load i8, ptr %getelementptr96, align 1
-  %getelementptr98 = getelementptr inbounds i8, ptr %load21, i64 12
-  store i8 %load97, ptr %getelementptr98, align 1
-  %getelementptr99 = getelementptr inbounds i8, ptr %load27, i64 12
-  %load100 = load i8, ptr %getelementptr99, align 1
-  %getelementptr101 = getelementptr inbounds i8, ptr %load25, i64 12
-  store i8 %load100, ptr %getelementptr101, align 1
-  %getelementptr102 = getelementptr inbounds i8, ptr %load23, i64 13
+  %getelementptr102 = getelementptr i8, ptr %load, i64 13
   %load103 = load i8, ptr %getelementptr102, align 1
-  %getelementptr104 = getelementptr inbounds i8, ptr %load21, i64 13
-  store i8 %load103, ptr %getelementptr104, align 1
-  %getelementptr105 = getelementptr inbounds i8, ptr %load27, i64 13
-  %load106 = load i8, ptr %getelementptr105, align 1
-  %getelementptr107 = getelementptr inbounds i8, ptr %load25, i64 13
-  store i8 %load106, ptr %getelementptr107, align 1
-  %getelementptr108 = getelementptr inbounds i8, ptr %load23, i64 14
+  %getelementptr108 = getelementptr i8, ptr %load, i64 14
   %load109 = load i8, ptr %getelementptr108, align 1
-  %getelementptr110 = getelementptr inbounds i8, ptr %load21, i64 14
-  store i8 %load109, ptr %getelementptr110, align 1
-  %getelementptr111 = getelementptr inbounds i8, ptr %load27, i64 14
-  %load112 = load i8, ptr %getelementptr111, align 1
-  %getelementptr113 = getelementptr inbounds i8, ptr %load25, i64 14
-  store i8 %load112, ptr %getelementptr113, align 1
-  %getelementptr114 = getelementptr inbounds i8, ptr %load23, i64 15
+  %getelementptr114 = getelementptr i8, ptr %load, i64 15
   %load115 = load i8, ptr %getelementptr114, align 1
-  %getelementptr116 = getelementptr inbounds i8, ptr %load21, i64 15
-  store i8 %load115, ptr %getelementptr116, align 1
-  %getelementptr117 = getelementptr inbounds i8, ptr %load27, i64 15
-  %load118 = load i8, ptr %getelementptr117, align 1
-  %getelementptr119 = getelementptr inbounds i8, ptr %load25, i64 15
-  store i8 %load118, ptr %getelementptr119, align 1
-  %getelementptr120 = getelementptr inbounds i8, ptr %load23, i64 16
+  %getelementptr120 = getelementptr i8, ptr %load, i64 16
   %load121 = load i8, ptr %getelementptr120, align 1
-  %getelementptr122 = getelementptr inbounds i8, ptr %load21, i64 16
-  store i8 %load121, ptr %getelementptr122, align 1
-  %getelementptr123 = getelementptr inbounds i8, ptr %load27, i64 16
-  %load124 = load i8, ptr %getelementptr123, align 1
-  %getelementptr125 = getelementptr inbounds i8, ptr %load25, i64 16
-  store i8 %load124, ptr %getelementptr125, align 1
-  %getelementptr126 = getelementptr inbounds i8, ptr %load23, i64 17
+  %getelementptr126 = getelementptr i8, ptr %load, i64 17
   %load127 = load i8, ptr %getelementptr126, align 1
-  %getelementptr128 = getelementptr inbounds i8, ptr %load21, i64 17
-  store i8 %load127, ptr %getelementptr128, align 1
-  %getelementptr129 = getelementptr inbounds i8, ptr %load27, i64 17
-  %load130 = load i8, ptr %getelementptr129, align 1
-  %getelementptr131 = getelementptr inbounds i8, ptr %load25, i64 17
-  store i8 %load130, ptr %getelementptr131, align 1
-  %getelementptr132 = getelementptr inbounds i8, ptr %load23, i64 18
+  %getelementptr132 = getelementptr i8, ptr %load, i64 18
   %load133 = load i8, ptr %getelementptr132, align 1
-  %getelementptr134 = getelementptr inbounds i8, ptr %load21, i64 18
-  store i8 %load133, ptr %getelementptr134, align 1
-  %getelementptr135 = getelementptr inbounds i8, ptr %load27, i64 18
-  %load136 = load i8, ptr %getelementptr135, align 1
-  %getelementptr137 = getelementptr inbounds i8, ptr %load25, i64 18
-  store i8 %load136, ptr %getelementptr137, align 1
-  %getelementptr138 = getelementptr inbounds i8, ptr %load23, i64 19
+  %getelementptr138 = getelementptr i8, ptr %load, i64 19
   %load139 = load i8, ptr %getelementptr138, align 1
-  %getelementptr140 = getelementptr inbounds i8, ptr %load21, i64 19
-  store i8 %load139, ptr %getelementptr140, align 1
-  %getelementptr141 = getelementptr inbounds i8, ptr %load27, i64 19
-  %load142 = load i8, ptr %getelementptr141, align 1
-  %getelementptr143 = getelementptr inbounds i8, ptr %load25, i64 19
-  store i8 %load142, ptr %getelementptr143, align 1
-  %getelementptr144 = getelementptr inbounds i8, ptr %load23, i64 20
+  %getelementptr144 = getelementptr i8, ptr %load, i64 20
   %load145 = load i8, ptr %getelementptr144, align 1
-  %getelementptr146 = getelementptr inbounds i8, ptr %load21, i64 20
-  store i8 %load145, ptr %getelementptr146, align 1
-  %getelementptr147 = getelementptr inbounds i8, ptr %load27, i64 20
-  %load148 = load i8, ptr %getelementptr147, align 1
-  %getelementptr149 = getelementptr inbounds i8, ptr %load25, i64 20
-  store i8 %load148, ptr %getelementptr149, align 1
-  %getelementptr150 = getelementptr inbounds i8, ptr %load23, i64 21
+  %getelementptr150 = getelementptr i8, ptr %load, i64 21
   %load151 = load i8, ptr %getelementptr150, align 1
-  %getelementptr152 = getelementptr inbounds i8, ptr %load21, i64 21
-  store i8 %load151, ptr %getelementptr152, align 1
-  %getelementptr153 = getelementptr inbounds i8, ptr %load27, i64 21
-  %load154 = load i8, ptr %getelementptr153, align 1
-  %getelementptr155 = getelementptr inbounds i8, ptr %load25, i64 21
-  store i8 %load154, ptr %getelementptr155, align 1
-  %getelementptr156 = getelementptr inbounds i8, ptr %load23, i64 22
+  %getelementptr156 = getelementptr i8, ptr %load, i64 22
   %load157 = load i8, ptr %getelementptr156, align 1
-  %getelementptr158 = getelementptr inbounds i8, ptr %load21, i64 22
-  store i8 %load157, ptr %getelementptr158, align 1
-  %getelementptr159 = getelementptr inbounds i8, ptr %load27, i64 22
-  %load160 = load i8, ptr %getelementptr159, align 1
-  %getelementptr161 = getelementptr inbounds i8, ptr %load25, i64 22
-  store i8 %load160, ptr %getelementptr161, align 1
-  %getelementptr162 = getelementptr inbounds i8, ptr %load23, i64 23
+  %getelementptr162 = getelementptr i8, ptr %load, i64 23
   %load163 = load i8, ptr %getelementptr162, align 1
-  %getelementptr164 = getelementptr inbounds i8, ptr %load21, i64 23
-  store i8 %load163, ptr %getelementptr164, align 1
-  %getelementptr165 = getelementptr inbounds i8, ptr %load27, i64 23
-  %load166 = load i8, ptr %getelementptr165, align 1
-  %getelementptr167 = getelementptr inbounds i8, ptr %load25, i64 23
-  store i8 %load166, ptr %getelementptr167, align 1
-  %getelementptr168 = getelementptr inbounds i8, ptr %load23, i64 24
+  %getelementptr168 = getelementptr i8, ptr %load, i64 24
   %load169 = load i8, ptr %getelementptr168, align 1
-  %getelementptr170 = getelementptr inbounds i8, ptr %load21, i64 24
-  store i8 %load169, ptr %getelementptr170, align 1
-  %getelementptr171 = getelementptr inbounds i8, ptr %load27, i64 24
-  %load172 = load i8, ptr %getelementptr171, align 1
-  %getelementptr173 = getelementptr inbounds i8, ptr %load25, i64 24
-  store i8 %load172, ptr %getelementptr173, align 1
-  %getelementptr174 = getelementptr inbounds i8, ptr %load23, i64 25
+  %getelementptr174 = getelementptr i8, ptr %load, i64 25
   %load175 = load i8, ptr %getelementptr174, align 1
-  %getelementptr176 = getelementptr inbounds i8, ptr %load21, i64 25
-  store i8 %load175, ptr %getelementptr176, align 1
-  %getelementptr177 = getelementptr inbounds i8, ptr %load27, i64 25
-  %load178 = load i8, ptr %getelementptr177, align 1
-  %getelementptr179 = getelementptr inbounds i8, ptr %load25, i64 25
-  store i8 %load178, ptr %getelementptr179, align 1
-  %getelementptr180 = getelementptr inbounds i8, ptr %load23, i64 26
+  %getelementptr180 = getelementptr i8, ptr %load, i64 26
   %load181 = load i8, ptr %getelementptr180, align 1
-  %getelementptr182 = getelementptr inbounds i8, ptr %load21, i64 26
-  store i8 %load181, ptr %getelementptr182, align 1
-  %getelementptr183 = getelementptr inbounds i8, ptr %load27, i64 26
-  %load184 = load i8, ptr %getelementptr183, align 1
-  %getelementptr185 = getelementptr inbounds i8, ptr %load25, i64 26
-  store i8 %load184, ptr %getelementptr185, align 1
-  %getelementptr186 = getelementptr inbounds i8, ptr %load23, i64 27
+  %getelementptr186 = getelementptr i8, ptr %load, i64 27
   %load187 = load i8, ptr %getelementptr186, align 1
-  %getelementptr188 = getelementptr inbounds i8, ptr %load21, i64 27
-  store i8 %load187, ptr %getelementptr188, align 1
-  %getelementptr189 = getelementptr inbounds i8, ptr %load27, i64 27
-  %load190 = load i8, ptr %getelementptr189, align 1
-  %getelementptr191 = getelementptr inbounds i8, ptr %load25, i64 27
-  store i8 %load190, ptr %getelementptr191, align 1
-  %getelementptr192 = getelementptr inbounds i8, ptr %load23, i64 28
+  %getelementptr192 = getelementptr i8, ptr %load, i64 28
   %load193 = load i8, ptr %getelementptr192, align 1
-  %getelementptr194 = getelementptr inbounds i8, ptr %load21, i64 28
-  store i8 %load193, ptr %getelementptr194, align 1
-  %getelementptr195 = getelementptr inbounds i8, ptr %load27, i64 28
-  %load196 = load i8, ptr %getelementptr195, align 1
-  %getelementptr197 = getelementptr inbounds i8, ptr %load25, i64 28
-  store i8 %load196, ptr %getelementptr197, align 1
-  %getelementptr198 = getelementptr inbounds i8, ptr %load23, i64 29
+  %getelementptr198 = getelementptr i8, ptr %load, i64 29
   %load199 = load i8, ptr %getelementptr198, align 1
-  %getelementptr200 = getelementptr inbounds i8, ptr %load21, i64 29
-  store i8 %load199, ptr %getelementptr200, align 1
-  %getelementptr201 = getelementptr inbounds i8, ptr %load27, i64 29
-  %load202 = load i8, ptr %getelementptr201, align 1
-  %getelementptr203 = getelementptr inbounds i8, ptr %load25, i64 29
-  store i8 %load202, ptr %getelementptr203, align 1
-  %getelementptr204 = getelementptr inbounds i8, ptr %load23, i64 30
+  %getelementptr204 = getelementptr i8, ptr %load, i64 30
   %load205 = load i8, ptr %getelementptr204, align 1
-  %getelementptr206 = getelementptr inbounds i8, ptr %load21, i64 30
-  store i8 %load205, ptr %getelementptr206, align 1
-  %getelementptr207 = getelementptr inbounds i8, ptr %load27, i64 30
-  %load208 = load i8, ptr %getelementptr207, align 1
-  %getelementptr209 = getelementptr inbounds i8, ptr %load25, i64 30
-  store i8 %load208, ptr %getelementptr209, align 1
-  %getelementptr210 = getelementptr inbounds i8, ptr %load23, i64 31
+  %getelementptr210 = getelementptr i8, ptr %load, i64 31
   %load211 = load i8, ptr %getelementptr210, align 1
-  %getelementptr212 = getelementptr inbounds i8, ptr %load21, i64 31
-  store i8 %load211, ptr %getelementptr212, align 1
-  %getelementptr213 = getelementptr inbounds i8, ptr %load27, i64 31
-  %load214 = load i8, ptr %getelementptr213, align 1
-  %getelementptr215 = getelementptr inbounds i8, ptr %load25, i64 31
-  store i8 %load214, ptr %getelementptr215, align 1
-  call void @llvm.lifetime.end.p5(i64 1, ptr addrspace(5) %alloca3)
-  %getelementptr216 = getelementptr inbounds %struct.bar, ptr %arg, i64 0, i32 5
-  %load217 = load ptr, ptr %getelementptr216, align 8
-  %load218 = load i64, ptr addrspace(5) %alloca1, align 8
-  %load219 = load i64, ptr addrspace(5) %alloca2, align 8
-  %load220 = load <16 x float>, ptr %load217, align 64
-  %call = call contract <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %load218, i64 %load219, <16 x float> %load220, i32 0, i32 0, i32 0)
-  store <16 x float> %call, ptr %load217, align 64
-  %getelementptr221 = getelementptr inbounds i8, ptr addrspace(5) %alloca1, i64 8
-  %getelementptr222 = getelementptr inbounds i8, ptr addrspace(5) %alloca2, i64 8
-  %load223 = load i64, ptr addrspace(5) %getelementptr221, align 8
-  %load224 = load i64, ptr addrspace(5) %getelementptr222, align 8
-  %call225 = call contract <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %load223, i64 %load224, <16 x float> %call, i32 0, i32 0, i32 0)
-  store <16 x float> %call225, ptr %load217, align 64
-  %getelementptr226 = getelementptr inbounds i8, ptr addrspace(5) %alloca1, i64 16
-  %getelementptr227 = getelementptr inbounds i8, ptr addrspace(5) %alloca2, i64 16
-  %load228 = load i64, ptr addrspace(5) %getelementptr226, align 8
-  %load229 = load i64, ptr addrspace(5) %getelementptr227, align 8
-  %call230 = call contract <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %load228, i64 %load229, <16 x float> %call225, i32 0, i32 0, i32 0)
-  store <16 x float> %call230, ptr %load217, align 64
-  %getelementptr231 = getelementptr inbounds i8, ptr addrspace(5) %alloca1, i64 24
-  %getelementptr232 = getelementptr inbounds i8, ptr addrspace(5) %alloca2, i64 24
-  %load233 = load i64, ptr addrspace(5) %getelementptr231, align 8
-  %load234 = load i64, ptr addrspace(5) %getelementptr232, align 8
-  %call235 = call contract <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %load233, i64 %load234, <16 x float> %call230, i32 0, i32 0, i32 0)
-  store <16 x float> %call235, ptr %load217, align 64
-  call void @llvm.lifetime.end.p5(i64 32, ptr addrspace(5) %alloca2)
-  call void @llvm.lifetime.end.p5(i64 32, ptr addrspace(5) %alloca1)
-  ret void
+  %alloca1.sroa.8.0.insert.ext = zext i8 %load67 to i64
+  %alloca1.sroa.8.0.insert.shift = shl i64 %alloca1.sroa.8.0.insert.ext, 56
+  %alloca1.sroa.7.0.insert.ext = zext i8 %load61 to i64
+  %alloca1.sroa.7.0.insert.shift = shl i64 %alloca1.sroa.7.0.insert.ext, 48
+  %alloca1.sroa.7.0.insert.insert = or i64 %alloca1.sroa.8.0.insert.shift, %alloca1.sroa.7.0.insert.shift
+  %alloca1.sroa.6.0.insert.ext = zext i8 %load55 to i64
+  %alloca1.sroa.6.0.insert.shift = shl i64 %alloca1.sroa.6.0.insert.ext, 40
+  %alloca1.sroa.6.0.insert.insert = or i64 %alloca1.sroa.7.0.insert.insert, %alloca1.sroa.6.0.insert.shift
+  %alloca1.sroa.5.0.insert.ext = zext i8 %load49 to i64
+  %alloca1.sroa.5.0.insert.shift = shl i64 %alloca1.sroa.5.0.insert.ext, 32
+  %alloca1.sroa.5.0.insert.insert = or i64 %alloca1.sroa.6.0.insert.insert, %alloca1.sroa.5.0.insert.shift
+  %alloca1.sroa.4.0.insert.ext = zext i8 %load43 to i64
+  %alloca1.sroa.4.0.insert.shift = shl i64 %alloca1.sroa.4.0.insert.ext, 24
+  %alloca1.sroa.4.0.insert.insert = or i64 %alloca1.sroa.5.0.insert.insert, %alloca1.sroa.4.0.insert.shift
+  %alloca1.sroa.3.0.insert.ext = zext i8 %load37 to i64
+  %alloca1.sroa.3.0.insert.shift = shl i64 %alloca1.sroa.3.0.insert.ext, 16
+  %alloca1.sroa.2.0.insert.ext = zext i8 %load31 to i64
+  %alloca1.sroa.2.0.insert.shift = shl i64 %alloca1.sroa.2.0.insert.ext, 8
+  %alloca1.sroa.2.0.insert.mask = or i64 %alloca1.sroa.4.0.insert.insert, %alloca1.sroa.3.0.insert.shift
+  %alloca1.sroa.0.0.insert.ext = zext i8 %load28 to i64
+  %alloca1.sroa.0.0.insert.mask = or i64 %alloca1.sroa.2.0.insert.mask, %alloca1.sroa.2.0.insert.shift
+  %alloca1.sroa.0.0.insert.insert = or i64 %alloca1.sroa.0.0.insert.mask, %alloca1.sroa.0.0.insert.ext
+  %call = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %alloca1.sroa.0.0.insert.insert, i64 0, <16 x float> zeroinitializer, i32 0, i32 0, i32 0)
+  %alloca1.sroa.17.8.insert.ext = zext i8 %load115 to i64
+  %alloca1.sroa.17.8.insert.shift = shl i64 %alloca1.sroa.17.8.insert.ext, 56
+  %alloca1.sroa.16.8.insert.ext = zext i8 %load109 to i64
+  %alloca1.sroa.16.8.insert.shift = shl i64 %alloca1.sroa.16.8.insert.ext, 48
+  %alloca1.sroa.16.8.insert.insert = or i64 %alloca1.sroa.17.8.insert.shift, %alloca1.sroa.16.8.insert.shift
+  %alloca1.sroa.15.8.insert.ext = zext i8 %load103 to i64
+  %alloca1.sroa.15.8.insert.shift = shl i64 %alloca1.sroa.15.8.insert.ext, 40
+  %alloca1.sroa.15.8.insert.insert = or i64 %alloca1.sroa.16.8.insert.insert, %alloca1.sroa.15.8.insert.shift
+  %alloca1.sroa.14.8.insert.ext = zext i8 %load97 to i64
+  %alloca1.sroa.14.8.insert.shift = shl i64 %alloca1.sroa.14.8.insert.ext, 32
+  %alloca1.sroa.14.8.insert.insert = or i64 %alloca1.sroa.15.8.insert.insert, %alloca1.sroa.14.8.insert.shift
+  %alloca1.sroa.13.8.insert.ext = zext i8 %load91 to i64
+  %alloca1.sroa.13.8.insert.shift = shl i64 %alloca1.sroa.13.8.insert.ext, 24
+  %alloca1.sroa.13.8.insert.insert = or i64 %alloca1.sroa.14.8.insert.insert, %alloca1.sroa.13.8.insert.shift
+  %alloca1.sroa.12.8.insert.ext = zext i8 %load85 to i64
+  %alloca1.sroa.12.8.insert.shift = shl i64 %alloca1.sroa.12.8.insert.ext, 16
+  %alloca1.sroa.11.8.insert.ext = zext i8 %load79 to i64
+  %alloca1.sroa.11.8.insert.shift = shl i64 %alloca1.sroa.11.8.insert.ext, 8
+  %alloca1.sroa.11.8.insert.mask = or i64 %alloca1.sroa.13.8.insert.insert, %alloca1.sroa.12.8.insert.shift
+  %alloca1.sroa.9.8.insert.ext = zext i8 %load73 to i64
+  %alloca1.sroa.9.8.insert.mask = or i64 %alloca1.sroa.11.8.insert.mask, %alloca1.sroa.11.8.insert.shift
+  %alloca1.sroa.9.8.insert.insert = or i64 %alloca1.sroa.9.8.insert.mask, %alloca1.sroa.9.8.insert.ext
+  %call225 = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %alloca1.sroa.9.8.insert.insert, i64 0, <16 x float> %call, i32 0, i32 0, i32 0)
+  %alloca1.sroa.26.16.insert.ext = zext i8 %load163 to i64
+  %alloca1.sroa.26.16.insert.shift = shl i64 %alloca1.sroa.26.16.insert.ext, 56
+  %alloca1.sroa.25.16.insert.ext = zext i8 %load157 to i64
+  %alloca1.sroa.25.16.insert.shift = shl i64 %alloca1.sroa.25.16.insert.ext, 48
+  %alloca1.sroa.25.16.insert.insert = or i64 %alloca1.sroa.26.16.insert.shift, %alloca1.sroa.25.16.insert.shift
+  %alloca1.sroa.24.16.insert.ext = zext i8 %load151 to i64
+  %alloca1.sroa.24.16.insert.shift = shl i64 %alloca1.sroa.24.16.insert.ext, 40
+  %alloca1.sroa.24.16.insert.insert = or i64 %alloca1.sroa.25.16.insert.insert, %alloca1.sroa.24.16.insert.shift
+  %alloca1.sroa.23.16.insert.ext = zext i8 %load145 to i64
+  %alloca1.sroa.23.16.insert.shift = shl i64 %alloca1.sroa.23.16.insert.ext, 32
+  %alloca1.sroa.23.16.insert.insert = or i64 %alloca1.sroa.24.16.insert.insert, %alloca1.sroa.23.16.insert.shift
+  %alloca1.sroa.22.16.insert.ext = zext i8 %load139 to i64
+  %alloca1.sroa.22.16.insert.shift = shl i64 %alloca1.sroa.22.16.insert.ext, 24
+  %alloca1.sroa.22.16.insert.insert = or i64 %alloca1.sroa.23.16.insert.insert, %alloca1.sroa.22.16.insert.shift
+  %alloca1.sroa.21.16.insert.ext = zext i8 %load133 to i64
+  %alloca1.sroa.21.16.insert.shift = shl i64 %alloca1.sroa.21.16.insert.ext, 16
+  %alloca1.sroa.20.16.insert.ext = zext i8 %load127 to i64
+  %alloca1.sroa.20.16.insert.shift = shl i64 %alloca1.sroa.20.16.insert.ext, 8
+  %alloca1.sroa.20.16.insert.mask = or i64 %alloca1.sroa.22.16.insert.insert, %alloca1.sroa.21.16.insert.shift
+  %alloca1.sroa.18.16.insert.ext = zext i8 %load121 to i64
+  %alloca1.sroa.18.16.insert.mask = or i64 %alloca1.sroa.20.16.insert.mask, %alloca1.sroa.20.16.insert.shift
+  %alloca1.sroa.18.16.insert.insert = or i64 %alloca1.sroa.18.16.insert.mask, %alloca1.sroa.18.16.insert.ext
+  %call230 = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %alloca1.sroa.18.16.insert.insert, i64 0, <16 x float> %call225, i32 0, i32 0, i32 0)
+  %alloca1.sroa.35.24.insert.ext = zext i8 %load211 to i64
+  %alloca1.sroa.35.24.insert.shift = shl i64 %alloca1.sroa.35.24.insert.ext, 56
+  %alloca1.sroa.34.24.insert.ext = zext i8 %load205 to i64
+  %alloca1.sroa.34.24.insert.shift = shl i64 %alloca1.sroa.34.24.insert.ext, 48
+  %alloca1.sroa.34.24.insert.insert = or i64 %alloca1.sroa.35.24.insert.shift, %alloca1.sroa.34.24.insert.shift
+  %alloca1.sroa.33.24.insert.ext = zext i8 %load199 to i64
+  %alloca1.sroa.33.24.insert.shift = shl i64 %alloca1.sroa.33.24.insert.ext, 40
+  %alloca1.sroa.33.24.insert.insert = or i64 %alloca1.sroa.34.24.insert.insert, %alloca1.sroa.33.24.insert.shift
+  %alloca1.sroa.32.24.insert.ext = zext i8 %load193 to i64
+  %alloca1.sroa.32.24.insert.shift = shl i64 %alloca1.sroa.32.24.insert.ext, 32
+  %alloca1.sroa.32.24.insert.insert = or i64 %alloca1.sroa.33.24.insert.insert, %alloca1.sroa.32.24.insert.shift
+  %alloca1.sroa.31.24.insert.ext = zext i8 %load187 to i64
+  %alloca1.sroa.31.24.insert.shift = shl i64 %alloca1.sroa.31.24.insert.ext, 24
+  %alloca1.sroa.31.24.insert.insert = or i64 %alloca1.sroa.32.24.insert.insert, %alloca1.sroa.31.24.insert.shift
+  %alloca1.sroa.30.24.insert.ext = zext i8 %load181 to i64
+  %alloca1.sroa.30.24.insert.shift = shl i64 %alloca1.sroa.30.24.insert.ext, 16
+  %alloca1.sroa.29.24.insert.ext = zext i8 %load175 to i64
+  %alloca1.sroa.29.24.insert.shift = shl i64 %alloca1.sroa.29.24.insert.ext, 8
+  %alloca1.sroa.29.24.insert.mask = or i64 %alloca1.sroa.31.24.insert.insert, %alloca1.sroa.30.24.insert.shift
+  %alloca1.sroa.27.24.insert.ext = zext i8 %load169 to i64
+  %alloca1.sroa.27.24.insert.mask = or i64 %alloca1.sroa.29.24.insert.mask, %alloca1.sroa.29.24.insert.shift
+  %alloca1.sroa.27.24.insert.insert = or i64 %alloca1.sroa.27.24.insert.mask, %alloca1.sroa.27.24.insert.ext
+  %call235 = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %alloca1.sroa.27.24.insert.insert, i64 0, <16 x float> %call230, i32 0, i32 0, i32 0)
+  ret <16 x float> %call235
 }
 
 ; Function Attrs: convergent nocallback nofree nosync nounwind willreturn memory(none)
 declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64, i64, <16 x float>, i32 immarg, i32 immarg, i32 immarg) #0
 
-; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
-declare void @llvm.lifetime.end.p5(i64 immarg, ptr addrspace(5) captures(none)) #1
-
-; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
-declare void @llvm.lifetime.start.p5(i64 immarg, ptr addrspace(5) captures(none)) #1
-
-; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write)
-declare void @llvm.memset.p5.i64(ptr addrspace(5) writeonly captures(none), i8, i64, i1 immarg) #2
+; uselistorder directives
+uselistorder ptr @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8, { 3, 2, 1, 0 }
 
 attributes #0 = { convergent nocallback nofree nosync nounwind willreturn memory(none) }
-attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
-attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: write) }

>From c14b85478f03b0f532e58efc9811bb2f715cd6e9 Mon Sep 17 00:00:00 2001
From: Matthew Curtis <macurtis at amd.com>
Date: Fri, 12 Sep 2025 07:14:26 -0500
Subject: [PATCH 4/5] fixup! AMDGPU: Report unaligned scratch access as fast if
 supported by tgt

---
 .../AggressiveInstCombine/{ => AMDGPU}/fold-consecutive-loads.ll  | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename llvm/test/Transforms/AggressiveInstCombine/{ => AMDGPU}/fold-consecutive-loads.ll (100%)

diff --git a/llvm/test/Transforms/AggressiveInstCombine/fold-consecutive-loads.ll b/llvm/test/Transforms/AggressiveInstCombine/AMDGPU/fold-consecutive-loads.ll
similarity index 100%
rename from llvm/test/Transforms/AggressiveInstCombine/fold-consecutive-loads.ll
rename to llvm/test/Transforms/AggressiveInstCombine/AMDGPU/fold-consecutive-loads.ll

>From 2db9a07d405c2835c8e1960a3232b7b416d1bb6a Mon Sep 17 00:00:00 2001
From: Matthew Curtis <macurtis at amd.com>
Date: Fri, 12 Sep 2025 07:14:26 -0500
Subject: [PATCH 5/5] fixup! AMDGPU: Report unaligned scratch access as fast if
 supported by tgt

---
 .../AggressiveInstCombine/AMDGPU/fold-consecutive-loads.ll    | 4 ----
 .../Transforms/AggressiveInstCombine/AMDGPU/lit.local.cfg     | 2 ++
 2 files changed, 2 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/Transforms/AggressiveInstCombine/AMDGPU/lit.local.cfg

diff --git a/llvm/test/Transforms/AggressiveInstCombine/AMDGPU/fold-consecutive-loads.ll b/llvm/test/Transforms/AggressiveInstCombine/AMDGPU/fold-consecutive-loads.ll
index 8fe242598ef0c..05d2330fffc7f 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/AMDGPU/fold-consecutive-loads.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/AMDGPU/fold-consecutive-loads.ll
@@ -229,10 +229,6 @@ bb:
   ret <16 x float> %call235
 }
 
-; Function Attrs: convergent nocallback nofree nosync nounwind willreturn memory(none)
 declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64, i64, <16 x float>, i32 immarg, i32 immarg, i32 immarg) #0
 
-; uselistorder directives
-uselistorder ptr @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8, { 3, 2, 1, 0 }
-
 attributes #0 = { convergent nocallback nofree nosync nounwind willreturn memory(none) }
diff --git a/llvm/test/Transforms/AggressiveInstCombine/AMDGPU/lit.local.cfg b/llvm/test/Transforms/AggressiveInstCombine/AMDGPU/lit.local.cfg
new file mode 100644
index 0000000000000..7c492428aec76
--- /dev/null
+++ b/llvm/test/Transforms/AggressiveInstCombine/AMDGPU/lit.local.cfg
@@ -0,0 +1,2 @@
+if not "AMDGPU" in config.root.targets:
+    config.unsupported = True


