[llvm] [AMDGPU] Enable more consecutive load folding during aggressive-instcombine (PR #158036)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Sep 12 01:07:43 PDT 2025
https://github.com/macurtis-amd updated https://github.com/llvm/llvm-project/pull/158036
From 72912dc5bfd9590c0ca8a91355f7fffe2ea6d3eb Mon Sep 17 00:00:00 2001
From: Matthew Curtis <macurtis at amd.com>
Date: Fri, 27 Jun 2025 13:05:11 -0500
Subject: [PATCH 1/4] [AMDGPU] Enable more consecutive load folding during
aggressive-instcombine
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 8 +-
.../CodeGen/AMDGPU/fold-consecutive-loads.ll | 457 ++++++++++++++
.../test/CodeGen/AMDGPU/memcpy-fixed-align.ll | 16 +-
llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll | 152 +++--
.../AMDGPU/memcpy-param-combinations.ll | 558 ++++++++----------
.../AMDGPU/memmove-param-combinations.ll | 64 +-
6 files changed, 815 insertions(+), 440 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/fold-consecutive-loads.ll
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index edce4856f77b0..f8e5740880708 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2090,10 +2090,16 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
bool AlignedBy4 = Alignment >= Align(4);
+ if (Subtarget->hasUnalignedScratchAccessEnabled()) {
+ if (IsFast)
+ *IsFast = AlignedBy4 ? Size : 1;
+ return true;
+ }
+
if (IsFast)
*IsFast = AlignedBy4;
- return AlignedBy4 || Subtarget->hasUnalignedScratchAccessEnabled();
+ return AlignedBy4;
}
// So long as they are correct, wide global memory operations perform better
diff --git a/llvm/test/CodeGen/AMDGPU/fold-consecutive-loads.ll b/llvm/test/CodeGen/AMDGPU/fold-consecutive-loads.ll
new file mode 100644
index 0000000000000..610760f788ea8
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fold-consecutive-loads.ll
@@ -0,0 +1,457 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes=sroa,instcombine,aggressive-instcombine %s -S -o - | FileCheck %s
+
+define i64 @quux(ptr %arg) {
+; CHECK-LABEL: define i64 @quux(
+; CHECK-SAME: ptr [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[BB:.*:]]
+; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr [[ARG]], align 1
+; CHECK-NEXT: ret i64 [[LOAD]]
+;
+bb:
+ %load = load i8, ptr %arg, align 1
+ %getelementptr = getelementptr inbounds nuw i8, ptr %arg, i64 1
+ %load1 = load i8, ptr %getelementptr, align 1
+ %getelementptr2 = getelementptr inbounds nuw i8, ptr %arg, i64 2
+ %load3 = load i8, ptr %getelementptr2, align 1
+ %getelementptr4 = getelementptr inbounds nuw i8, ptr %arg, i64 3
+ %load5 = load i8, ptr %getelementptr4, align 1
+ %getelementptr6 = getelementptr inbounds nuw i8, ptr %arg, i64 4
+ %load7 = load i8, ptr %getelementptr6, align 1
+ %getelementptr8 = getelementptr inbounds nuw i8, ptr %arg, i64 5
+ %load9 = load i8, ptr %getelementptr8, align 1
+ %getelementptr10 = getelementptr inbounds nuw i8, ptr %arg, i64 6
+ %load11 = load i8, ptr %getelementptr10, align 1
+ %getelementptr12 = getelementptr inbounds nuw i8, ptr %arg, i64 7
+ %load13 = load i8, ptr %getelementptr12, align 1
+ %zext = zext i8 %load13 to i64
+ %shl = shl nuw i64 %zext, 56
+ %zext14 = zext i8 %load11 to i64
+ %shl15 = shl nuw nsw i64 %zext14, 48
+ %or = or disjoint i64 %shl, %shl15
+ %zext16 = zext i8 %load9 to i64
+ %shl17 = shl nuw nsw i64 %zext16, 40
+ %or18 = or disjoint i64 %or, %shl17
+ %zext19 = zext i8 %load7 to i64
+ %shl20 = shl nuw nsw i64 %zext19, 32
+ %or21 = or disjoint i64 %or18, %shl20
+ %zext22 = zext i8 %load5 to i64
+ %shl23 = shl nuw nsw i64 %zext22, 24
+ %or24 = or disjoint i64 %or21, %shl23
+ %zext25 = zext i8 %load3 to i64
+ %shl26 = shl nuw nsw i64 %zext25, 16
+ %zext27 = zext i8 %load1 to i64
+ %shl28 = shl nuw nsw i64 %zext27, 8
+ %or29 = or disjoint i64 %or24, %shl26
+ %zext30 = zext i8 %load to i64
+ %or31 = or i64 %or29, %shl28
+ %or32 = or i64 %or31, %zext30
+ ret i64 %or32
+}
+
+
+; The following test case was reduced from a client kernel.
+%struct.eggs = type { i8 }
+%struct.pluto = type { %struct.spam }
+%struct.spam = type { <32 x i8> }
+%struct.snork = type { i8 }
+%struct.quux = type { ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr }
+%struct.bar = type { ptr, ptr, ptr, ptr, ptr, ptr }
+
+define fastcc void @hoge(ptr noundef nonnull readonly align 8 captures(none) dereferenceable(48) %arg) {
+; CHECK-LABEL: define fastcc void @hoge(
+; CHECK-SAME: ptr noundef nonnull readonly align 8 captures(none) dereferenceable(48) [[ARG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[BB:.*:]]
+; CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr [[ARG]], align 8
+; CHECK-NEXT: [[GETELEMENTPTR13:%.*]] = getelementptr inbounds nuw i8, ptr [[ARG]], i64 16
+; CHECK-NEXT: [[LOAD14:%.*]] = load ptr, ptr [[GETELEMENTPTR13]], align 8
+; CHECK-NEXT: [[LOAD28:%.*]] = load i64, ptr [[LOAD]], align 1
+; CHECK-NEXT: [[LOAD29:%.*]] = load i64, ptr [[LOAD14]], align 1
+; CHECK-NEXT: [[GETELEMENTPTR72:%.*]] = getelementptr inbounds nuw i8, ptr [[LOAD]], i64 8
+; CHECK-NEXT: [[LOAD73:%.*]] = load i64, ptr [[GETELEMENTPTR72]], align 1
+; CHECK-NEXT: [[GETELEMENTPTR75:%.*]] = getelementptr inbounds nuw i8, ptr [[LOAD14]], i64 8
+; CHECK-NEXT: [[LOAD76:%.*]] = load i64, ptr [[GETELEMENTPTR75]], align 1
+; CHECK-NEXT: [[GETELEMENTPTR120:%.*]] = getelementptr inbounds nuw i8, ptr [[LOAD]], i64 16
+; CHECK-NEXT: [[LOAD121:%.*]] = load i64, ptr [[GETELEMENTPTR120]], align 1
+; CHECK-NEXT: [[GETELEMENTPTR123:%.*]] = getelementptr inbounds nuw i8, ptr [[LOAD14]], i64 16
+; CHECK-NEXT: [[LOAD124:%.*]] = load i64, ptr [[GETELEMENTPTR123]], align 1
+; CHECK-NEXT: [[GETELEMENTPTR168:%.*]] = getelementptr inbounds nuw i8, ptr [[LOAD]], i64 24
+; CHECK-NEXT: [[LOAD169:%.*]] = load i64, ptr [[GETELEMENTPTR168]], align 1
+; CHECK-NEXT: [[GETELEMENTPTR171:%.*]] = getelementptr inbounds nuw i8, ptr [[LOAD14]], i64 24
+; CHECK-NEXT: [[LOAD172:%.*]] = load i32, ptr [[GETELEMENTPTR171]], align 1
+; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[LOAD172]] to i64
+; CHECK-NEXT: [[GETELEMENTPTR195:%.*]] = getelementptr inbounds nuw i8, ptr [[LOAD14]], i64 28
+; CHECK-NEXT: [[LOAD196:%.*]] = load i8, ptr [[GETELEMENTPTR195]], align 1
+; CHECK-NEXT: [[ALLOCA2_SROA_30_28_INSERT_EXT:%.*]] = zext i8 [[LOAD196]] to i64
+; CHECK-NEXT: [[ALLOCA2_SROA_30_28_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[ALLOCA2_SROA_30_28_INSERT_EXT]], 32
+; CHECK-NEXT: [[GETELEMENTPTR201:%.*]] = getelementptr inbounds nuw i8, ptr [[LOAD14]], i64 29
+; CHECK-NEXT: [[LOAD202:%.*]] = load i8, ptr [[GETELEMENTPTR201]], align 1
+; CHECK-NEXT: [[ALLOCA2_SROA_30_29_INSERT_EXT:%.*]] = zext i8 [[LOAD202]] to i64
+; CHECK-NEXT: [[ALLOCA2_SROA_30_29_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[ALLOCA2_SROA_30_29_INSERT_EXT]], 40
+; CHECK-NEXT: [[ALLOCA2_SROA_30_29_INSERT_MASK:%.*]] = or disjoint i64 [[TMP0]], [[ALLOCA2_SROA_30_28_INSERT_SHIFT]]
+; CHECK-NEXT: [[GETELEMENTPTR207:%.*]] = getelementptr inbounds nuw i8, ptr [[LOAD14]], i64 30
+; CHECK-NEXT: [[LOAD208:%.*]] = load i8, ptr [[GETELEMENTPTR207]], align 1
+; CHECK-NEXT: [[ALLOCA2_SROA_30_30_INSERT_EXT:%.*]] = zext i8 [[LOAD208]] to i64
+; CHECK-NEXT: [[ALLOCA2_SROA_30_30_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[ALLOCA2_SROA_30_30_INSERT_EXT]], 48
+; CHECK-NEXT: [[GETELEMENTPTR213:%.*]] = getelementptr inbounds nuw i8, ptr [[LOAD14]], i64 31
+; CHECK-NEXT: [[LOAD214:%.*]] = load i8, ptr [[GETELEMENTPTR213]], align 1
+; CHECK-NEXT: [[ALLOCA2_SROA_30_31_INSERT_EXT:%.*]] = zext i8 [[LOAD214]] to i64
+; CHECK-NEXT: [[ALLOCA2_SROA_30_31_INSERT_SHIFT:%.*]] = shl nuw i64 [[ALLOCA2_SROA_30_31_INSERT_EXT]], 56
+; CHECK-NEXT: [[ALLOCA2_SROA_30_30_INSERT_MASK_MASKED:%.*]] = or i64 [[ALLOCA2_SROA_30_29_INSERT_MASK]], [[ALLOCA2_SROA_30_29_INSERT_SHIFT]]
+; CHECK-NEXT: [[ALLOCA2_SROA_30_31_INSERT_MASK:%.*]] = or i64 [[ALLOCA2_SROA_30_30_INSERT_MASK_MASKED]], [[ALLOCA2_SROA_30_30_INSERT_SHIFT]]
+; CHECK-NEXT: [[ALLOCA2_SROA_30_31_INSERT_INSERT:%.*]] = or i64 [[ALLOCA2_SROA_30_31_INSERT_MASK]], [[ALLOCA2_SROA_30_31_INSERT_SHIFT]]
+; CHECK-NEXT: [[GETELEMENTPTR216:%.*]] = getelementptr inbounds nuw i8, ptr [[ARG]], i64 40
+; CHECK-NEXT: [[LOAD217:%.*]] = load ptr, ptr [[GETELEMENTPTR216]], align 8
+; CHECK-NEXT: [[LOAD220:%.*]] = load <16 x float>, ptr [[LOAD217]], align 64
+; CHECK-NEXT: [[CALL:%.*]] = call contract <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 [[LOAD28]], i64 [[LOAD29]], <16 x float> [[LOAD220]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: store <16 x float> [[CALL]], ptr [[LOAD217]], align 64
+; CHECK-NEXT: [[CALL225:%.*]] = call contract <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 [[LOAD73]], i64 [[LOAD76]], <16 x float> [[CALL]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: store <16 x float> [[CALL225]], ptr [[LOAD217]], align 64
+; CHECK-NEXT: [[CALL230:%.*]] = call contract <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 [[LOAD121]], i64 [[LOAD124]], <16 x float> [[CALL225]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[CALL235:%.*]] = call contract <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 [[LOAD169]], i64 [[ALLOCA2_SROA_30_31_INSERT_INSERT]], <16 x float> [[CALL230]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: store <16 x float> [[CALL235]], ptr [[LOAD217]], align 64
+; CHECK-NEXT: ret void
+;
+bb:
+ %alloca = alloca %struct.eggs, align 1, addrspace(5)
+ %alloca1 = alloca %struct.pluto, align 32, addrspace(5)
+ %alloca2 = alloca %struct.pluto, align 32, addrspace(5)
+ %alloca3 = alloca %struct.snork, align 1, addrspace(5)
+ %alloca4 = alloca %struct.quux, align 8, addrspace(5)
+ %addrspacecast = addrspacecast ptr addrspace(5) %alloca to ptr
+ %addrspacecast5 = addrspacecast ptr addrspace(5) %alloca1 to ptr
+ %addrspacecast6 = addrspacecast ptr addrspace(5) %alloca2 to ptr
+ call void @llvm.lifetime.start.p5(i64 32, ptr addrspace(5) %alloca1)
+ call void @llvm.memset.p5.i64(ptr addrspace(5) align 32 %alloca1, i8 0, i64 32, i1 false)
+ call void @llvm.lifetime.start.p5(i64 32, ptr addrspace(5) %alloca2)
+ call void @llvm.memset.p5.i64(ptr addrspace(5) align 32 %alloca2, i8 0, i64 32, i1 false)
+ call void @llvm.lifetime.start.p5(i64 1, ptr addrspace(5) %alloca3)
+ store ptr %addrspacecast5, ptr addrspace(5) %alloca4, align 8
+ %getelementptr = getelementptr inbounds %struct.quux, ptr addrspace(5) %alloca4, i64 0, i32 1
+ %load = load ptr, ptr %arg, align 8
+ store ptr %load, ptr addrspace(5) %getelementptr, align 8
+ %getelementptr7 = getelementptr inbounds %struct.quux, ptr addrspace(5) %alloca4, i64 0, i32 2
+ %getelementptr8 = getelementptr inbounds %struct.bar, ptr %arg, i64 0, i32 1
+ %load9 = load ptr, ptr %getelementptr8, align 8
+ store ptr %load9, ptr addrspace(5) %getelementptr7, align 8
+ %getelementptr10 = getelementptr inbounds %struct.quux, ptr addrspace(5) %alloca4, i64 0, i32 3
+ store ptr %addrspacecast, ptr addrspace(5) %getelementptr10, align 8
+ %getelementptr11 = getelementptr inbounds %struct.quux, ptr addrspace(5) %alloca4, i64 0, i32 4
+ store ptr %addrspacecast6, ptr addrspace(5) %getelementptr11, align 8
+ %getelementptr12 = getelementptr inbounds %struct.quux, ptr addrspace(5) %alloca4, i64 0, i32 5
+ %getelementptr13 = getelementptr inbounds %struct.bar, ptr %arg, i64 0, i32 2
+ %load14 = load ptr, ptr %getelementptr13, align 8
+ store ptr %load14, ptr addrspace(5) %getelementptr12, align 8
+ %getelementptr15 = getelementptr inbounds %struct.quux, ptr addrspace(5) %alloca4, i64 0, i32 6
+ %getelementptr16 = getelementptr inbounds %struct.bar, ptr %arg, i64 0, i32 3
+ %load17 = load ptr, ptr %getelementptr16, align 8
+ store ptr %load17, ptr addrspace(5) %getelementptr15, align 8
+ %getelementptr18 = getelementptr inbounds %struct.quux, ptr addrspace(5) %alloca4, i64 0, i32 7
+ %getelementptr19 = getelementptr inbounds %struct.bar, ptr %arg, i64 0, i32 4
+ %load20 = load ptr, ptr %getelementptr19, align 8
+ store ptr %load20, ptr addrspace(5) %getelementptr18, align 8
+ %load21 = load ptr, ptr addrspace(5) %alloca4, align 8
+ %getelementptr22 = getelementptr inbounds i8, ptr addrspace(5) %alloca4, i32 8
+ %load23 = load ptr, ptr addrspace(5) %getelementptr22, align 8
+ %getelementptr24 = getelementptr inbounds i8, ptr addrspace(5) %alloca4, i32 32
+ %load25 = load ptr, ptr addrspace(5) %getelementptr24, align 8
+ %getelementptr26 = getelementptr inbounds i8, ptr addrspace(5) %alloca4, i32 40
+ %load27 = load ptr, ptr addrspace(5) %getelementptr26, align 8
+ %load28 = load i8, ptr %load23, align 1
+ store i8 %load28, ptr %load21, align 1
+ %load29 = load i8, ptr %load27, align 1
+ store i8 %load29, ptr %load25, align 1
+ %getelementptr30 = getelementptr inbounds i8, ptr %load23, i64 1
+ %load31 = load i8, ptr %getelementptr30, align 1
+ %getelementptr32 = getelementptr inbounds i8, ptr %load21, i64 1
+ store i8 %load31, ptr %getelementptr32, align 1
+ %getelementptr33 = getelementptr inbounds i8, ptr %load27, i64 1
+ %load34 = load i8, ptr %getelementptr33, align 1
+ %getelementptr35 = getelementptr inbounds i8, ptr %load25, i64 1
+ store i8 %load34, ptr %getelementptr35, align 1
+ %getelementptr36 = getelementptr inbounds i8, ptr %load23, i64 2
+ %load37 = load i8, ptr %getelementptr36, align 1
+ %getelementptr38 = getelementptr inbounds i8, ptr %load21, i64 2
+ store i8 %load37, ptr %getelementptr38, align 1
+ %getelementptr39 = getelementptr inbounds i8, ptr %load27, i64 2
+ %load40 = load i8, ptr %getelementptr39, align 1
+ %getelementptr41 = getelementptr inbounds i8, ptr %load25, i64 2
+ store i8 %load40, ptr %getelementptr41, align 1
+ %getelementptr42 = getelementptr inbounds i8, ptr %load23, i64 3
+ %load43 = load i8, ptr %getelementptr42, align 1
+ %getelementptr44 = getelementptr inbounds i8, ptr %load21, i64 3
+ store i8 %load43, ptr %getelementptr44, align 1
+ %getelementptr45 = getelementptr inbounds i8, ptr %load27, i64 3
+ %load46 = load i8, ptr %getelementptr45, align 1
+ %getelementptr47 = getelementptr inbounds i8, ptr %load25, i64 3
+ store i8 %load46, ptr %getelementptr47, align 1
+ %getelementptr48 = getelementptr inbounds i8, ptr %load23, i64 4
+ %load49 = load i8, ptr %getelementptr48, align 1
+ %getelementptr50 = getelementptr inbounds i8, ptr %load21, i64 4
+ store i8 %load49, ptr %getelementptr50, align 1
+ %getelementptr51 = getelementptr inbounds i8, ptr %load27, i64 4
+ %load52 = load i8, ptr %getelementptr51, align 1
+ %getelementptr53 = getelementptr inbounds i8, ptr %load25, i64 4
+ store i8 %load52, ptr %getelementptr53, align 1
+ %getelementptr54 = getelementptr inbounds i8, ptr %load23, i64 5
+ %load55 = load i8, ptr %getelementptr54, align 1
+ %getelementptr56 = getelementptr inbounds i8, ptr %load21, i64 5
+ store i8 %load55, ptr %getelementptr56, align 1
+ %getelementptr57 = getelementptr inbounds i8, ptr %load27, i64 5
+ %load58 = load i8, ptr %getelementptr57, align 1
+ %getelementptr59 = getelementptr inbounds i8, ptr %load25, i64 5
+ store i8 %load58, ptr %getelementptr59, align 1
+ %getelementptr60 = getelementptr inbounds i8, ptr %load23, i64 6
+ %load61 = load i8, ptr %getelementptr60, align 1
+ %getelementptr62 = getelementptr inbounds i8, ptr %load21, i64 6
+ store i8 %load61, ptr %getelementptr62, align 1
+ %getelementptr63 = getelementptr inbounds i8, ptr %load27, i64 6
+ %load64 = load i8, ptr %getelementptr63, align 1
+ %getelementptr65 = getelementptr inbounds i8, ptr %load25, i64 6
+ store i8 %load64, ptr %getelementptr65, align 1
+ %getelementptr66 = getelementptr inbounds i8, ptr %load23, i64 7
+ %load67 = load i8, ptr %getelementptr66, align 1
+ %getelementptr68 = getelementptr inbounds i8, ptr %load21, i64 7
+ store i8 %load67, ptr %getelementptr68, align 1
+ %getelementptr69 = getelementptr inbounds i8, ptr %load27, i64 7
+ %load70 = load i8, ptr %getelementptr69, align 1
+ %getelementptr71 = getelementptr inbounds i8, ptr %load25, i64 7
+ store i8 %load70, ptr %getelementptr71, align 1
+ %getelementptr72 = getelementptr inbounds i8, ptr %load23, i64 8
+ %load73 = load i8, ptr %getelementptr72, align 1
+ %getelementptr74 = getelementptr inbounds i8, ptr %load21, i64 8
+ store i8 %load73, ptr %getelementptr74, align 1
+ %getelementptr75 = getelementptr inbounds i8, ptr %load27, i64 8
+ %load76 = load i8, ptr %getelementptr75, align 1
+ %getelementptr77 = getelementptr inbounds i8, ptr %load25, i64 8
+ store i8 %load76, ptr %getelementptr77, align 1
+ %getelementptr78 = getelementptr inbounds i8, ptr %load23, i64 9
+ %load79 = load i8, ptr %getelementptr78, align 1
+ %getelementptr80 = getelementptr inbounds i8, ptr %load21, i64 9
+ store i8 %load79, ptr %getelementptr80, align 1
+ %getelementptr81 = getelementptr inbounds i8, ptr %load27, i64 9
+ %load82 = load i8, ptr %getelementptr81, align 1
+ %getelementptr83 = getelementptr inbounds i8, ptr %load25, i64 9
+ store i8 %load82, ptr %getelementptr83, align 1
+ %getelementptr84 = getelementptr inbounds i8, ptr %load23, i64 10
+ %load85 = load i8, ptr %getelementptr84, align 1
+ %getelementptr86 = getelementptr inbounds i8, ptr %load21, i64 10
+ store i8 %load85, ptr %getelementptr86, align 1
+ %getelementptr87 = getelementptr inbounds i8, ptr %load27, i64 10
+ %load88 = load i8, ptr %getelementptr87, align 1
+ %getelementptr89 = getelementptr inbounds i8, ptr %load25, i64 10
+ store i8 %load88, ptr %getelementptr89, align 1
+ %getelementptr90 = getelementptr inbounds i8, ptr %load23, i64 11
+ %load91 = load i8, ptr %getelementptr90, align 1
+ %getelementptr92 = getelementptr inbounds i8, ptr %load21, i64 11
+ store i8 %load91, ptr %getelementptr92, align 1
+ %getelementptr93 = getelementptr inbounds i8, ptr %load27, i64 11
+ %load94 = load i8, ptr %getelementptr93, align 1
+ %getelementptr95 = getelementptr inbounds i8, ptr %load25, i64 11
+ store i8 %load94, ptr %getelementptr95, align 1
+ %getelementptr96 = getelementptr inbounds i8, ptr %load23, i64 12
+ %load97 = load i8, ptr %getelementptr96, align 1
+ %getelementptr98 = getelementptr inbounds i8, ptr %load21, i64 12
+ store i8 %load97, ptr %getelementptr98, align 1
+ %getelementptr99 = getelementptr inbounds i8, ptr %load27, i64 12
+ %load100 = load i8, ptr %getelementptr99, align 1
+ %getelementptr101 = getelementptr inbounds i8, ptr %load25, i64 12
+ store i8 %load100, ptr %getelementptr101, align 1
+ %getelementptr102 = getelementptr inbounds i8, ptr %load23, i64 13
+ %load103 = load i8, ptr %getelementptr102, align 1
+ %getelementptr104 = getelementptr inbounds i8, ptr %load21, i64 13
+ store i8 %load103, ptr %getelementptr104, align 1
+ %getelementptr105 = getelementptr inbounds i8, ptr %load27, i64 13
+ %load106 = load i8, ptr %getelementptr105, align 1
+ %getelementptr107 = getelementptr inbounds i8, ptr %load25, i64 13
+ store i8 %load106, ptr %getelementptr107, align 1
+ %getelementptr108 = getelementptr inbounds i8, ptr %load23, i64 14
+ %load109 = load i8, ptr %getelementptr108, align 1
+ %getelementptr110 = getelementptr inbounds i8, ptr %load21, i64 14
+ store i8 %load109, ptr %getelementptr110, align 1
+ %getelementptr111 = getelementptr inbounds i8, ptr %load27, i64 14
+ %load112 = load i8, ptr %getelementptr111, align 1
+ %getelementptr113 = getelementptr inbounds i8, ptr %load25, i64 14
+ store i8 %load112, ptr %getelementptr113, align 1
+ %getelementptr114 = getelementptr inbounds i8, ptr %load23, i64 15
+ %load115 = load i8, ptr %getelementptr114, align 1
+ %getelementptr116 = getelementptr inbounds i8, ptr %load21, i64 15
+ store i8 %load115, ptr %getelementptr116, align 1
+ %getelementptr117 = getelementptr inbounds i8, ptr %load27, i64 15
+ %load118 = load i8, ptr %getelementptr117, align 1
+ %getelementptr119 = getelementptr inbounds i8, ptr %load25, i64 15
+ store i8 %load118, ptr %getelementptr119, align 1
+ %getelementptr120 = getelementptr inbounds i8, ptr %load23, i64 16
+ %load121 = load i8, ptr %getelementptr120, align 1
+ %getelementptr122 = getelementptr inbounds i8, ptr %load21, i64 16
+ store i8 %load121, ptr %getelementptr122, align 1
+ %getelementptr123 = getelementptr inbounds i8, ptr %load27, i64 16
+ %load124 = load i8, ptr %getelementptr123, align 1
+ %getelementptr125 = getelementptr inbounds i8, ptr %load25, i64 16
+ store i8 %load124, ptr %getelementptr125, align 1
+ %getelementptr126 = getelementptr inbounds i8, ptr %load23, i64 17
+ %load127 = load i8, ptr %getelementptr126, align 1
+ %getelementptr128 = getelementptr inbounds i8, ptr %load21, i64 17
+ store i8 %load127, ptr %getelementptr128, align 1
+ %getelementptr129 = getelementptr inbounds i8, ptr %load27, i64 17
+ %load130 = load i8, ptr %getelementptr129, align 1
+ %getelementptr131 = getelementptr inbounds i8, ptr %load25, i64 17
+ store i8 %load130, ptr %getelementptr131, align 1
+ %getelementptr132 = getelementptr inbounds i8, ptr %load23, i64 18
+ %load133 = load i8, ptr %getelementptr132, align 1
+ %getelementptr134 = getelementptr inbounds i8, ptr %load21, i64 18
+ store i8 %load133, ptr %getelementptr134, align 1
+ %getelementptr135 = getelementptr inbounds i8, ptr %load27, i64 18
+ %load136 = load i8, ptr %getelementptr135, align 1
+ %getelementptr137 = getelementptr inbounds i8, ptr %load25, i64 18
+ store i8 %load136, ptr %getelementptr137, align 1
+ %getelementptr138 = getelementptr inbounds i8, ptr %load23, i64 19
+ %load139 = load i8, ptr %getelementptr138, align 1
+ %getelementptr140 = getelementptr inbounds i8, ptr %load21, i64 19
+ store i8 %load139, ptr %getelementptr140, align 1
+ %getelementptr141 = getelementptr inbounds i8, ptr %load27, i64 19
+ %load142 = load i8, ptr %getelementptr141, align 1
+ %getelementptr143 = getelementptr inbounds i8, ptr %load25, i64 19
+ store i8 %load142, ptr %getelementptr143, align 1
+ %getelementptr144 = getelementptr inbounds i8, ptr %load23, i64 20
+ %load145 = load i8, ptr %getelementptr144, align 1
+ %getelementptr146 = getelementptr inbounds i8, ptr %load21, i64 20
+ store i8 %load145, ptr %getelementptr146, align 1
+ %getelementptr147 = getelementptr inbounds i8, ptr %load27, i64 20
+ %load148 = load i8, ptr %getelementptr147, align 1
+ %getelementptr149 = getelementptr inbounds i8, ptr %load25, i64 20
+ store i8 %load148, ptr %getelementptr149, align 1
+ %getelementptr150 = getelementptr inbounds i8, ptr %load23, i64 21
+ %load151 = load i8, ptr %getelementptr150, align 1
+ %getelementptr152 = getelementptr inbounds i8, ptr %load21, i64 21
+ store i8 %load151, ptr %getelementptr152, align 1
+ %getelementptr153 = getelementptr inbounds i8, ptr %load27, i64 21
+ %load154 = load i8, ptr %getelementptr153, align 1
+ %getelementptr155 = getelementptr inbounds i8, ptr %load25, i64 21
+ store i8 %load154, ptr %getelementptr155, align 1
+ %getelementptr156 = getelementptr inbounds i8, ptr %load23, i64 22
+ %load157 = load i8, ptr %getelementptr156, align 1
+ %getelementptr158 = getelementptr inbounds i8, ptr %load21, i64 22
+ store i8 %load157, ptr %getelementptr158, align 1
+ %getelementptr159 = getelementptr inbounds i8, ptr %load27, i64 22
+ %load160 = load i8, ptr %getelementptr159, align 1
+ %getelementptr161 = getelementptr inbounds i8, ptr %load25, i64 22
+ store i8 %load160, ptr %getelementptr161, align 1
+ %getelementptr162 = getelementptr inbounds i8, ptr %load23, i64 23
+ %load163 = load i8, ptr %getelementptr162, align 1
+ %getelementptr164 = getelementptr inbounds i8, ptr %load21, i64 23
+ store i8 %load163, ptr %getelementptr164, align 1
+ %getelementptr165 = getelementptr inbounds i8, ptr %load27, i64 23
+ %load166 = load i8, ptr %getelementptr165, align 1
+ %getelementptr167 = getelementptr inbounds i8, ptr %load25, i64 23
+ store i8 %load166, ptr %getelementptr167, align 1
+ %getelementptr168 = getelementptr inbounds i8, ptr %load23, i64 24
+ %load169 = load i8, ptr %getelementptr168, align 1
+ %getelementptr170 = getelementptr inbounds i8, ptr %load21, i64 24
+ store i8 %load169, ptr %getelementptr170, align 1
+ %getelementptr171 = getelementptr inbounds i8, ptr %load27, i64 24
+ %load172 = load i8, ptr %getelementptr171, align 1
+ %getelementptr173 = getelementptr inbounds i8, ptr %load25, i64 24
+ store i8 %load172, ptr %getelementptr173, align 1
+ %getelementptr174 = getelementptr inbounds i8, ptr %load23, i64 25
+ %load175 = load i8, ptr %getelementptr174, align 1
+ %getelementptr176 = getelementptr inbounds i8, ptr %load21, i64 25
+ store i8 %load175, ptr %getelementptr176, align 1
+ %getelementptr177 = getelementptr inbounds i8, ptr %load27, i64 25
+ %load178 = load i8, ptr %getelementptr177, align 1
+ %getelementptr179 = getelementptr inbounds i8, ptr %load25, i64 25
+ store i8 %load178, ptr %getelementptr179, align 1
+ %getelementptr180 = getelementptr inbounds i8, ptr %load23, i64 26
+ %load181 = load i8, ptr %getelementptr180, align 1
+ %getelementptr182 = getelementptr inbounds i8, ptr %load21, i64 26
+ store i8 %load181, ptr %getelementptr182, align 1
+ %getelementptr183 = getelementptr inbounds i8, ptr %load27, i64 26
+ %load184 = load i8, ptr %getelementptr183, align 1
+ %getelementptr185 = getelementptr inbounds i8, ptr %load25, i64 26
+ store i8 %load184, ptr %getelementptr185, align 1
+ %getelementptr186 = getelementptr inbounds i8, ptr %load23, i64 27
+ %load187 = load i8, ptr %getelementptr186, align 1
+ %getelementptr188 = getelementptr inbounds i8, ptr %load21, i64 27
+ store i8 %load187, ptr %getelementptr188, align 1
+ %getelementptr189 = getelementptr inbounds i8, ptr %load27, i64 27
+ %load190 = load i8, ptr %getelementptr189, align 1
+ %getelementptr191 = getelementptr inbounds i8, ptr %load25, i64 27
+ store i8 %load190, ptr %getelementptr191, align 1
+ %getelementptr192 = getelementptr inbounds i8, ptr %load23, i64 28
+ %load193 = load i8, ptr %getelementptr192, align 1
+ %getelementptr194 = getelementptr inbounds i8, ptr %load21, i64 28
+ store i8 %load193, ptr %getelementptr194, align 1
+ %getelementptr195 = getelementptr inbounds i8, ptr %load27, i64 28
+ %load196 = load i8, ptr %getelementptr195, align 1
+ %getelementptr197 = getelementptr inbounds i8, ptr %load25, i64 28
+ store i8 %load196, ptr %getelementptr197, align 1
+ %getelementptr198 = getelementptr inbounds i8, ptr %load23, i64 29
+ %load199 = load i8, ptr %getelementptr198, align 1
+ %getelementptr200 = getelementptr inbounds i8, ptr %load21, i64 29
+ store i8 %load199, ptr %getelementptr200, align 1
+ %getelementptr201 = getelementptr inbounds i8, ptr %load27, i64 29
+ %load202 = load i8, ptr %getelementptr201, align 1
+ %getelementptr203 = getelementptr inbounds i8, ptr %load25, i64 29
+ store i8 %load202, ptr %getelementptr203, align 1
+ %getelementptr204 = getelementptr inbounds i8, ptr %load23, i64 30
+ %load205 = load i8, ptr %getelementptr204, align 1
+ %getelementptr206 = getelementptr inbounds i8, ptr %load21, i64 30
+ store i8 %load205, ptr %getelementptr206, align 1
+ %getelementptr207 = getelementptr inbounds i8, ptr %load27, i64 30
+ %load208 = load i8, ptr %getelementptr207, align 1
+ %getelementptr209 = getelementptr inbounds i8, ptr %load25, i64 30
+ store i8 %load208, ptr %getelementptr209, align 1
+ %getelementptr210 = getelementptr inbounds i8, ptr %load23, i64 31
+ %load211 = load i8, ptr %getelementptr210, align 1
+ %getelementptr212 = getelementptr inbounds i8, ptr %load21, i64 31
+ store i8 %load211, ptr %getelementptr212, align 1
+ %getelementptr213 = getelementptr inbounds i8, ptr %load27, i64 31
+ %load214 = load i8, ptr %getelementptr213, align 1
+ %getelementptr215 = getelementptr inbounds i8, ptr %load25, i64 31
+ store i8 %load214, ptr %getelementptr215, align 1
+ call void @llvm.lifetime.end.p5(i64 1, ptr addrspace(5) %alloca3)
+ %getelementptr216 = getelementptr inbounds %struct.bar, ptr %arg, i64 0, i32 5
+ %load217 = load ptr, ptr %getelementptr216, align 8
+ %load218 = load i64, ptr addrspace(5) %alloca1, align 8
+ %load219 = load i64, ptr addrspace(5) %alloca2, align 8
+ %load220 = load <16 x float>, ptr %load217, align 64
+ %call = call contract <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %load218, i64 %load219, <16 x float> %load220, i32 0, i32 0, i32 0)
+ store <16 x float> %call, ptr %load217, align 64
+ %getelementptr221 = getelementptr inbounds i8, ptr addrspace(5) %alloca1, i64 8
+ %getelementptr222 = getelementptr inbounds i8, ptr addrspace(5) %alloca2, i64 8
+ %load223 = load i64, ptr addrspace(5) %getelementptr221, align 8
+ %load224 = load i64, ptr addrspace(5) %getelementptr222, align 8
+ %call225 = call contract <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %load223, i64 %load224, <16 x float> %call, i32 0, i32 0, i32 0)
+ store <16 x float> %call225, ptr %load217, align 64
+ %getelementptr226 = getelementptr inbounds i8, ptr addrspace(5) %alloca1, i64 16
+ %getelementptr227 = getelementptr inbounds i8, ptr addrspace(5) %alloca2, i64 16
+ %load228 = load i64, ptr addrspace(5) %getelementptr226, align 8
+ %load229 = load i64, ptr addrspace(5) %getelementptr227, align 8
+ %call230 = call contract <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %load228, i64 %load229, <16 x float> %call225, i32 0, i32 0, i32 0)
+ store <16 x float> %call230, ptr %load217, align 64
+ %getelementptr231 = getelementptr inbounds i8, ptr addrspace(5) %alloca1, i64 24
+ %getelementptr232 = getelementptr inbounds i8, ptr addrspace(5) %alloca2, i64 24
+ %load233 = load i64, ptr addrspace(5) %getelementptr231, align 8
+ %load234 = load i64, ptr addrspace(5) %getelementptr232, align 8
+ %call235 = call contract <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %load233, i64 %load234, <16 x float> %call230, i32 0, i32 0, i32 0)
+ store <16 x float> %call235, ptr %load217, align 64
+ call void @llvm.lifetime.end.p5(i64 32, ptr addrspace(5) %alloca2)
+ call void @llvm.lifetime.end.p5(i64 32, ptr addrspace(5) %alloca1)
+ ret void
+}
+
+; Function Attrs: convergent nocallback nofree nosync nounwind willreturn memory(none)
+declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64, i64, <16 x float>, i32 immarg, i32 immarg, i32 immarg) #0
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.lifetime.end.p5(i64 immarg, ptr addrspace(5) captures(none)) #1
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.lifetime.start.p5(i64 immarg, ptr addrspace(5) captures(none)) #1
+
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write)
+declare void @llvm.memset.p5.i64(ptr addrspace(5) writeonly captures(none), i8, i64, i1 immarg) #2
+
+attributes #0 = { convergent nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
+attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: write) }
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll b/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll
index 37a261cab7563..e8bd640aa5409 100644
--- a/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll
@@ -7,23 +7,25 @@ define void @memcpy_fixed_align(ptr addrspace(5) %dst, ptr addrspace(1) %src) {
; MUBUF-LABEL: memcpy_fixed_align:
; MUBUF: ; %bb.0:
; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; MUBUF-NEXT: global_load_dwordx2 v[11:12], v[1:2], off offset:32
; MUBUF-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
; MUBUF-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16
+; MUBUF-NEXT: global_load_dwordx4 v[11:14], v[1:2], off offset:24
; MUBUF-NEXT: s_lshr_b32 s4, s32, 6
; MUBUF-NEXT: s_waitcnt vmcnt(2)
-; MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:32
-; MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:36
-; MUBUF-NEXT: s_waitcnt vmcnt(3)
; MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:12
; MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:8
; MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:4
; MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], s32
-; MUBUF-NEXT: s_waitcnt vmcnt(6)
+; MUBUF-NEXT: s_waitcnt vmcnt(5)
; MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:28
; MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:24
; MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:20
; MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:16
+; MUBUF-NEXT: s_waitcnt vmcnt(8)
+; MUBUF-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:36
+; MUBUF-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:32
+; MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:28
+; MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:24
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ; use s4
; MUBUF-NEXT: ;;#ASMEND
@@ -35,14 +37,14 @@ define void @memcpy_fixed_align(ptr addrspace(5) %dst, ptr addrspace(1) %src) {
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
; FLATSCR-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16
-; FLATSCR-NEXT: global_load_dwordx2 v[11:12], v[1:2], off offset:32
+; FLATSCR-NEXT: global_load_dwordx4 v[11:14], v[1:2], off offset:24
; FLATSCR-NEXT: s_mov_b32 s0, s32
; FLATSCR-NEXT: s_waitcnt vmcnt(2)
; FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s32
; FLATSCR-NEXT: s_waitcnt vmcnt(2)
; FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s32 offset:16
; FLATSCR-NEXT: s_waitcnt vmcnt(2)
-; FLATSCR-NEXT: scratch_store_dwordx2 off, v[11:12], s32 offset:32
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[11:14], s32 offset:24
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ; use s0
; FLATSCR-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
index 0003366f3a3ea..5b7c36559a366 100644
--- a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
@@ -12,21 +12,19 @@ define amdgpu_kernel void @memcpy_p0_p0_minsize(ptr %dest, ptr readonly %src) #0
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v12, s3
-; CHECK-NEXT: v_mov_b32_e32 v11, s2
-; CHECK-NEXT: flat_load_ubyte v13, v[11:12] offset:46
-; CHECK-NEXT: flat_load_ushort v14, v[11:12] offset:44
-; CHECK-NEXT: flat_load_dwordx3 v[8:10], v[11:12] offset:32
-; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[11:12] offset:16
-; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[11:12]
-; CHECK-NEXT: v_mov_b32_e32 v12, s1
-; CHECK-NEXT: v_mov_b32_e32 v11, s0
+; CHECK-NEXT: v_mov_b32_e32 v9, s3
+; CHECK-NEXT: v_mov_b32_e32 v8, s2
+; CHECK-NEXT: flat_load_dwordx2 v[10:11], v[8:9] offset:32
+; CHECK-NEXT: flat_load_dwordx2 v[12:13], v[8:9] offset:39
+; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[8:9]
+; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[8:9] offset:16
+; CHECK-NEXT: v_mov_b32_e32 v9, s1
+; CHECK-NEXT: v_mov_b32_e32 v8, s0
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[11:12], v13 offset:46
-; CHECK-NEXT: flat_store_short v[11:12], v14 offset:44
-; CHECK-NEXT: flat_store_dwordx3 v[11:12], v[8:10] offset:32
-; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[0:3] offset:16
-; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[4:7]
+; CHECK-NEXT: flat_store_dwordx2 v[8:9], v[10:11] offset:32
+; CHECK-NEXT: flat_store_dwordx2 v[8:9], v[12:13] offset:39
+; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
+; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[4:7] offset:16
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false)
@@ -173,33 +171,33 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %
; CHECK-NEXT: v_mov_b32_e32 v26, s0
; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:124
; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:120
-; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100
-; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108
; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:116
; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:112
+; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108
; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:104
+; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100
; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen offset:96
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-NEXT: buffer_load_dword v8, v26, s[20:23], 0 offen offset:32
-; CHECK-NEXT: buffer_load_dword v9, v26, s[20:23], 0 offen offset:36
-; CHECK-NEXT: buffer_load_dword v10, v26, s[20:23], 0 offen offset:40
-; CHECK-NEXT: buffer_load_dword v11, v26, s[20:23], 0 offen offset:44
-; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:48
-; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:52
-; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:56
-; CHECK-NEXT: buffer_load_dword v15, v26, s[20:23], 0 offen offset:60
-; CHECK-NEXT: buffer_load_dword v17, v26, s[20:23], 0 offen offset:68
-; CHECK-NEXT: buffer_load_dword v19, v26, s[20:23], 0 offen offset:76
-; CHECK-NEXT: buffer_load_dword v21, v26, s[20:23], 0 offen offset:84
-; CHECK-NEXT: buffer_load_dword v23, v26, s[20:23], 0 offen offset:92
-; CHECK-NEXT: buffer_load_dword v22, v26, s[20:23], 0 offen offset:88
-; CHECK-NEXT: buffer_load_dword v20, v26, s[20:23], 0 offen offset:80
-; CHECK-NEXT: buffer_load_dword v18, v26, s[20:23], 0 offen offset:72
-; CHECK-NEXT: buffer_load_dword v16, v26, s[20:23], 0 offen offset:64
+; CHECK-NEXT: buffer_load_dword v11, v26, s[20:23], 0 offen offset:92
+; CHECK-NEXT: buffer_load_dword v10, v26, s[20:23], 0 offen offset:88
+; CHECK-NEXT: buffer_load_dword v9, v26, s[20:23], 0 offen offset:84
+; CHECK-NEXT: buffer_load_dword v8, v26, s[20:23], 0 offen offset:80
+; CHECK-NEXT: buffer_load_dword v15, v26, s[20:23], 0 offen offset:76
+; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:72
+; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:68
+; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:64
+; CHECK-NEXT: buffer_load_dword v16, v26, s[20:23], 0 offen offset:32
+; CHECK-NEXT: buffer_load_dword v17, v26, s[20:23], 0 offen offset:36
+; CHECK-NEXT: buffer_load_dword v18, v26, s[20:23], 0 offen offset:40
+; CHECK-NEXT: buffer_load_dword v19, v26, s[20:23], 0 offen offset:44
+; CHECK-NEXT: buffer_load_dword v20, v26, s[20:23], 0 offen offset:48
+; CHECK-NEXT: buffer_load_dword v21, v26, s[20:23], 0 offen offset:52
+; CHECK-NEXT: buffer_load_dword v22, v26, s[20:23], 0 offen offset:56
+; CHECK-NEXT: buffer_load_dword v23, v26, s[20:23], 0 offen offset:60
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v25, s1
; CHECK-NEXT: v_mov_b32_e32 v24, s0
-; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: s_waitcnt vmcnt(20)
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96
@@ -213,10 +211,10 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %
; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:28
; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:12
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:80
-; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19] offset:64
-; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:48
-; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:32
+; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:80
+; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:64
+; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:48
+; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19] offset:32
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:16
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3]
@@ -281,8 +279,8 @@ define amdgpu_kernel void @memcpy_p0_p3_minsize(ptr %generic) #0 {
; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[8:11] offset:32
; CHECK-NEXT: ds_read2_b64 v[0:3], v16 offset0:8 offset1:9
; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:10 offset1:11
-; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:12 offset1:13
-; CHECK-NEXT: ds_read2_b64 v[16:19], v16 offset0:14 offset1:15
+; CHECK-NEXT: ds_read_b128 v[8:11], v16 offset:96
+; CHECK-NEXT: ds_read_b128 v[16:19], v16 offset:112
; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[12:15] offset:48
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[0:3] offset:64
@@ -302,21 +300,19 @@ define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 {
; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v12, s3
-; CHECK-NEXT: v_mov_b32_e32 v11, s2
-; CHECK-NEXT: flat_load_ubyte v13, v[11:12] offset:46
-; CHECK-NEXT: flat_load_ushort v14, v[11:12] offset:44
-; CHECK-NEXT: flat_load_dwordx3 v[8:10], v[11:12] offset:32
-; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[11:12] offset:16
-; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[11:12]
-; CHECK-NEXT: v_mov_b32_e32 v12, s1
-; CHECK-NEXT: v_mov_b32_e32 v11, s0
+; CHECK-NEXT: v_mov_b32_e32 v9, s3
+; CHECK-NEXT: v_mov_b32_e32 v8, s2
+; CHECK-NEXT: flat_load_dwordx2 v[10:11], v[8:9] offset:32
+; CHECK-NEXT: flat_load_dwordx2 v[12:13], v[8:9] offset:39
+; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[8:9]
+; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[8:9] offset:16
+; CHECK-NEXT: v_mov_b32_e32 v9, s1
+; CHECK-NEXT: v_mov_b32_e32 v8, s0
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[11:12], v13 offset:46
-; CHECK-NEXT: flat_store_short v[11:12], v14 offset:44
-; CHECK-NEXT: flat_store_dwordx3 v[11:12], v[8:10] offset:32
-; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[0:3] offset:16
-; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[4:7]
+; CHECK-NEXT: flat_store_dwordx2 v[8:9], v[10:11] offset:32
+; CHECK-NEXT: flat_store_dwordx2 v[8:9], v[12:13] offset:39
+; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
+; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[4:7] offset:16
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false)
@@ -463,33 +459,33 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %
; CHECK-NEXT: v_mov_b32_e32 v26, s0
; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:124
; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:120
-; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100
-; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108
; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:116
; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:112
+; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108
; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:104
+; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100
; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen offset:96
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-NEXT: buffer_load_dword v8, v26, s[20:23], 0 offen offset:32
-; CHECK-NEXT: buffer_load_dword v9, v26, s[20:23], 0 offen offset:36
-; CHECK-NEXT: buffer_load_dword v10, v26, s[20:23], 0 offen offset:40
-; CHECK-NEXT: buffer_load_dword v11, v26, s[20:23], 0 offen offset:44
-; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:48
-; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:52
-; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:56
-; CHECK-NEXT: buffer_load_dword v15, v26, s[20:23], 0 offen offset:60
-; CHECK-NEXT: buffer_load_dword v17, v26, s[20:23], 0 offen offset:68
-; CHECK-NEXT: buffer_load_dword v19, v26, s[20:23], 0 offen offset:76
-; CHECK-NEXT: buffer_load_dword v21, v26, s[20:23], 0 offen offset:84
-; CHECK-NEXT: buffer_load_dword v23, v26, s[20:23], 0 offen offset:92
-; CHECK-NEXT: buffer_load_dword v22, v26, s[20:23], 0 offen offset:88
-; CHECK-NEXT: buffer_load_dword v20, v26, s[20:23], 0 offen offset:80
-; CHECK-NEXT: buffer_load_dword v18, v26, s[20:23], 0 offen offset:72
-; CHECK-NEXT: buffer_load_dword v16, v26, s[20:23], 0 offen offset:64
+; CHECK-NEXT: buffer_load_dword v11, v26, s[20:23], 0 offen offset:92
+; CHECK-NEXT: buffer_load_dword v10, v26, s[20:23], 0 offen offset:88
+; CHECK-NEXT: buffer_load_dword v9, v26, s[20:23], 0 offen offset:84
+; CHECK-NEXT: buffer_load_dword v8, v26, s[20:23], 0 offen offset:80
+; CHECK-NEXT: buffer_load_dword v15, v26, s[20:23], 0 offen offset:76
+; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:72
+; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:68
+; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:64
+; CHECK-NEXT: buffer_load_dword v16, v26, s[20:23], 0 offen offset:32
+; CHECK-NEXT: buffer_load_dword v17, v26, s[20:23], 0 offen offset:36
+; CHECK-NEXT: buffer_load_dword v18, v26, s[20:23], 0 offen offset:40
+; CHECK-NEXT: buffer_load_dword v19, v26, s[20:23], 0 offen offset:44
+; CHECK-NEXT: buffer_load_dword v20, v26, s[20:23], 0 offen offset:48
+; CHECK-NEXT: buffer_load_dword v21, v26, s[20:23], 0 offen offset:52
+; CHECK-NEXT: buffer_load_dword v22, v26, s[20:23], 0 offen offset:56
+; CHECK-NEXT: buffer_load_dword v23, v26, s[20:23], 0 offen offset:60
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v25, s1
; CHECK-NEXT: v_mov_b32_e32 v24, s0
-; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: s_waitcnt vmcnt(20)
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96
@@ -503,10 +499,10 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %
; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:28
; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:12
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:80
-; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19] offset:64
-; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:48
-; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:32
+; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:80
+; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:64
+; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:48
+; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19] offset:32
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:16
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3]
@@ -571,8 +567,8 @@ define amdgpu_kernel void @memcpy_p0_p3_optsize(ptr %generic) #1 {
; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[8:11] offset:32
; CHECK-NEXT: ds_read2_b64 v[0:3], v16 offset0:8 offset1:9
; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:10 offset1:11
-; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:12 offset1:13
-; CHECK-NEXT: ds_read2_b64 v[16:19], v16 offset0:14 offset1:15
+; CHECK-NEXT: ds_read_b128 v[8:11], v16 offset:96
+; CHECK-NEXT: ds_read_b128 v[16:19], v16 offset:112
; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[12:15] offset:48
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[0:3] offset:64
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll b/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll
index b43ccc551ca95..048610184368d 100644
--- a/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll
@@ -27,19 +27,16 @@ define void @memcpy_p0_p0_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p0_sz31_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:30
-; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28
-; CHECK-NEXT: flat_load_dwordx3 v[6:8], v[2:3] offset:16
-; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3]
-; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30
-; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3)
-; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
-; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3)
-; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3)
-; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: flat_load_dwordx2 v[8:9], v[2:3] offset:23
+; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3]
+; CHECK-NEXT: flat_load_dwordx2 v[2:3], v[2:3] offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[8:9] offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(2)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(2)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -83,19 +80,16 @@ define void @memcpy_p0_p0_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p0_sz31_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:30
-; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28
-; CHECK-NEXT: flat_load_dwordx3 v[6:8], v[2:3] offset:16
-; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3]
-; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30
-; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3)
-; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
-; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3)
-; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3)
-; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: flat_load_dwordx2 v[8:9], v[2:3] offset:23
+; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3]
+; CHECK-NEXT: flat_load_dwordx2 v[2:3], v[2:3] offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[8:9] offset:23
+; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(2)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(2)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -239,19 +233,16 @@ define void @memcpy_p0_p1_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p1_sz31_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:30
-; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
-; CHECK-NEXT: global_load_dwordx3 v[6:8], v[2:3], off offset:16
-; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: global_load_dwordx2 v[8:9], v[2:3], off offset:23
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[8:9] offset:23
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -295,19 +286,16 @@ define void @memcpy_p0_p1_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p1_sz31_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:30
-; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28
-; CHECK-NEXT: global_load_dwordx3 v[6:8], v[2:3], off offset:16
-; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: global_load_dwordx2 v[8:9], v[2:3], off offset:23
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:16
; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[8:9] offset:23
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -437,7 +425,7 @@ define void @memcpy_p0_p3_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p3_sz16_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1
+; CHECK-NEXT: ds_read_b128 v[2:5], v2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
@@ -451,19 +439,15 @@ define void @memcpy_p0_p3_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p3_sz31_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_read_b32 v8, v2 offset:24
-; CHECK-NEXT: ds_read_u8 v9, v2 offset:30
-; CHECK-NEXT: ds_read_u16 v10, v2 offset:28
-; CHECK-NEXT: ds_read_b64 v[6:7], v2 offset:16
-; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT: ds_read_b64 v[7:8], v2 offset:23
+; CHECK-NEXT: ds_read_b128 v[3:6], v2
+; CHECK-NEXT: ds_read_b64 v[9:10], v2 offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[7:8] offset:23
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[9:10] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -475,8 +459,8 @@ define void @memcpy_p0_p3_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p3_sz32_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset0:2 offset1:3
-; CHECK-NEXT: ds_read2_b64 v[7:10], v2 offset1:1
+; CHECK-NEXT: ds_read_b128 v[3:6], v2 offset:16
+; CHECK-NEXT: ds_read_b128 v[7:10], v2
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
@@ -492,7 +476,7 @@ define void @memcpy_p0_p3_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p3_sz16_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1
+; CHECK-NEXT: ds_read_b128 v[2:5], v2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
@@ -506,19 +490,15 @@ define void @memcpy_p0_p3_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p3_sz31_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_read_b32 v8, v2 offset:24
-; CHECK-NEXT: ds_read_u8 v9, v2 offset:30
-; CHECK-NEXT: ds_read_u16 v10, v2 offset:28
-; CHECK-NEXT: ds_read_b64 v[6:7], v2 offset:16
-; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT: ds_read_b64 v[7:8], v2 offset:23
+; CHECK-NEXT: ds_read_b128 v[3:6], v2
+; CHECK-NEXT: ds_read_b64 v[9:10], v2 offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[7:8] offset:23
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT: s_waitcnt lgkmcnt(2)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[9:10] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -530,8 +510,8 @@ define void @memcpy_p0_p3_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p3_sz32_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset0:2 offset1:3
-; CHECK-NEXT: ds_read2_b64 v[7:10], v2 offset1:1
+; CHECK-NEXT: ds_read_b128 v[3:6], v2 offset:16
+; CHECK-NEXT: ds_read_b128 v[7:10], v2
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
@@ -643,12 +623,9 @@ define void @memcpy_p0_p4_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p4_sz16_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
-; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:8
+; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:8
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -660,24 +637,16 @@ define void @memcpy_p0_p4_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p4_sz31_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
-; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:8
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:8
-; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:16
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:16
-; CHECK-NEXT: global_load_dword v4, v[2:3], off offset:24
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dword v[0:1], v4 offset:24
-; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:28
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx2 v[8:9], v[2:3], off
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[8:9]
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28
-; CHECK-NEXT: global_load_ubyte v2, v[2:3], off offset:30
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:8
+; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:23
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:30
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:23
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -689,18 +658,13 @@ define void @memcpy_p0_p4_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p4_sz32_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
-; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:8
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:8
-; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:16
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:16
-; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:24
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:24
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -712,12 +676,9 @@ define void @memcpy_p0_p4_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p4_sz16_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
-; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:8
+; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:8
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -729,24 +690,16 @@ define void @memcpy_p0_p4_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p4_sz31_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
-; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:8
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:8
-; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:16
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:16
-; CHECK-NEXT: global_load_dword v4, v[2:3], off offset:24
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dword v[0:1], v4 offset:24
-; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:28
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx2 v[8:9], v[2:3], off
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[8:9]
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28
-; CHECK-NEXT: global_load_ubyte v2, v[2:3], off offset:30
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:8
+; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:23
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:30
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:23
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -758,18 +711,13 @@ define void @memcpy_p0_p4_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p4_sz32_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
-; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:8
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:8
-; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:16
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:16
-; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:24
+; CHECK-NEXT: s_clause 0x1
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(1)
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:24
+; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -895,22 +843,20 @@ define void @memcpy_p0_p5_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p5_sz31_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x8
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_clause 0x7
; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
-; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28
; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: flat_store_short v[0:1], v11 offset:28
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:30
-; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[7:9] offset:16
-; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[7:8] offset:23
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[9:10] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -964,22 +910,20 @@ define void @memcpy_p0_p5_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr
; CHECK-LABEL: memcpy_p0_p5_sz31_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x8
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT: s_clause 0x7
; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
-; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
-; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28
; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: flat_store_short v[0:1], v11 offset:28
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:30
-; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[7:9] offset:16
-; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(2)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[7:8] offset:23
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6]
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[9:10] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -1161,15 +1105,15 @@ define void @memcpy_p1_p0_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_clause 0x2
-; CHECK-NEXT: flat_load_dwordx2 v[6:7], v[2:3] offset:23
-; CHECK-NEXT: flat_load_dwordx2 v[8:9], v[2:3] offset:16
-; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3]
+; CHECK-NEXT: flat_load_dwordx2 v[8:9], v[2:3] offset:23
+; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3]
+; CHECK-NEXT: flat_load_dwordx2 v[2:3], v[2:3] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:23
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:23
; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:16
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[2:3], off offset:16
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -1211,15 +1155,15 @@ define void @memcpy_p1_p0_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_clause 0x2
-; CHECK-NEXT: flat_load_dwordx2 v[6:7], v[2:3] offset:23
-; CHECK-NEXT: flat_load_dwordx2 v[8:9], v[2:3] offset:16
-; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3]
+; CHECK-NEXT: flat_load_dwordx2 v[8:9], v[2:3] offset:23
+; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3]
+; CHECK-NEXT: flat_load_dwordx2 v[2:3], v[2:3] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:23
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:23
; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:16
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[2:3], off offset:16
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -1929,18 +1873,18 @@ define void @memcpy_p1_p5_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_clause 0x7
-; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
-; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:20
; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:23
; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:27
-; CHECK-NEXT: s_waitcnt vmcnt(4)
-; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off
; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off offset:16
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off offset:8
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx2 v[0:1], v[9:10], off offset:23
; CHECK-NEXT: s_setpc_b64 s[30:31]
@@ -1994,18 +1938,18 @@ define void @memcpy_p1_p5_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_clause 0x7
-; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
-; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
-; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:20
; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:23
; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:27
-; CHECK-NEXT: s_waitcnt vmcnt(4)
-; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off
; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off offset:16
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off offset:8
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx2 v[0:1], v[9:10], off offset:23
; CHECK-NEXT: s_setpc_b64 s[30:31]
@@ -3267,19 +3211,16 @@ define void @memcpy_p5_p0_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
; CHECK-LABEL: memcpy_p5_p0_sz31_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:30
-; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28
-; CHECK-NEXT: flat_load_dwordx3 v[5:7], v[1:2] offset:16
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: flat_load_dwordx2 v[5:6], v[1:2] offset:23
+; CHECK-NEXT: flat_load_dwordx2 v[7:8], v[1:2] offset:16
; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2]
-; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:30
; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
@@ -3334,19 +3275,16 @@ define void @memcpy_p5_p0_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
; CHECK-LABEL: memcpy_p5_p0_sz31_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:30
-; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28
-; CHECK-NEXT: flat_load_dwordx3 v[5:7], v[1:2] offset:16
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: flat_load_dwordx2 v[5:6], v[1:2] offset:23
+; CHECK-NEXT: flat_load_dwordx2 v[7:8], v[1:2] offset:16
; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2]
-; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
-; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:30
; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
-; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
-; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
@@ -3525,24 +3463,21 @@ define void @memcpy_p5_p1_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
; CHECK-LABEL: memcpy_p5_p1_sz31_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16
-; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28
-; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30
-; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx2 v[7:8], v[1:2], off offset:16
+; CHECK-NEXT: global_load_dwordx2 v[1:2], v[1:2], off offset:23
; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -3592,24 +3527,21 @@ define void @memcpy_p5_p1_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
; CHECK-LABEL: memcpy_p5_p1_sz31_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16
-; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28
-; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30
-; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx2 v[7:8], v[1:2], off offset:16
+; CHECK-NEXT: global_load_dwordx2 v[1:2], v[1:2], off offset:23
; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -3783,25 +3715,20 @@ define void @memcpy_p5_p3_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
; CHECK-LABEL: memcpy_p5_p3_sz31_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_read_b32 v8, v1 offset:24
-; CHECK-NEXT: ds_read_u16 v9, v1 offset:28
-; CHECK-NEXT: ds_read_u8 v10, v1 offset:30
; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1
; CHECK-NEXT: ds_read_b64 v[6:7], v1 offset:16
-; CHECK-NEXT: s_waitcnt lgkmcnt(4)
-; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: ds_read_b64 v[8:9], v1 offset:23
; CHECK-NEXT: s_waitcnt lgkmcnt(2)
-; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT: s_waitcnt lgkmcnt(1)
; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -3850,25 +3777,20 @@ define void @memcpy_p5_p3_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
; CHECK-LABEL: memcpy_p5_p3_sz31_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_read_b32 v8, v1 offset:24
-; CHECK-NEXT: ds_read_u16 v9, v1 offset:28
-; CHECK-NEXT: ds_read_u8 v10, v1 offset:30
; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1
; CHECK-NEXT: ds_read_b64 v[6:7], v1 offset:16
-; CHECK-NEXT: s_waitcnt lgkmcnt(4)
-; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24
-; CHECK-NEXT: s_waitcnt lgkmcnt(3)
-; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: ds_read_b64 v[8:9], v1 offset:23
; CHECK-NEXT: s_waitcnt lgkmcnt(2)
-; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:30
-; CHECK-NEXT: s_waitcnt lgkmcnt(1)
; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_waitcnt lgkmcnt(1)
; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -4037,24 +3959,21 @@ define void @memcpy_p5_p4_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
; CHECK-LABEL: memcpy_p5_p4_sz31_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16
-; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28
-; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30
-; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx2 v[7:8], v[1:2], off offset:16
+; CHECK-NEXT: global_load_dwordx2 v[1:2], v[1:2], off offset:23
; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -4104,24 +4023,21 @@ define void @memcpy_p5_p4_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
; CHECK-LABEL: memcpy_p5_p4_sz31_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x3
-; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16
-; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28
-; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30
-; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off
-; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; CHECK-NEXT: global_load_dwordx2 v[7:8], v[1:2], off offset:16
+; CHECK-NEXT: global_load_dwordx2 v[1:2], v[1:2], off offset:23
; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12
-; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8
-; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:27
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 31, i1 false)
@@ -4302,34 +4218,31 @@ define void @memcpy_p5_p5_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr
; CHECK-LABEL: memcpy_p5_p5_sz31_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x8
-; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:27
; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16
; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:20
; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen
-; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT: s_waitcnt vmcnt(8)
-; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4
; CHECK-NEXT: s_waitcnt vmcnt(7)
-; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT: s_waitcnt vmcnt(6)
-; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:27
; CHECK-NEXT: s_waitcnt vmcnt(5)
-; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT: s_waitcnt vmcnt(4)
-; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -4398,34 +4311,31 @@ define void @memcpy_p5_p5_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr
; CHECK-LABEL: memcpy_p5_p5_sz31_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_clause 0x8
-; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:28
-; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:24
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:23
+; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:27
; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16
; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:20
; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8
; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12
; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen
-; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:30
-; CHECK-NEXT: s_waitcnt vmcnt(8)
-; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4
; CHECK-NEXT: s_waitcnt vmcnt(7)
-; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:23
; CHECK-NEXT: s_waitcnt vmcnt(6)
-; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:27
; CHECK-NEXT: s_waitcnt vmcnt(5)
-; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16
; CHECK-NEXT: s_waitcnt vmcnt(4)
-; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20
; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8
; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen
+; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false)
diff --git a/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll b/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll
index f08ea27040fb5..01b7f40f6256f 100644
--- a/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll
+++ b/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll
@@ -471,7 +471,7 @@ define void @memmove_p0_p3_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
; CHECK-LABEL: memmove_p0_p3_sz16_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1
+; CHECK-NEXT: ds_read_b128 v[2:5], v2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
@@ -489,7 +489,7 @@ define void @memmove_p0_p3_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
; CHECK-NEXT: ds_read_u8 v9, v2 offset:30
; CHECK-NEXT: ds_read_u16 v10, v2 offset:28
; CHECK-NEXT: ds_read_b64 v[6:7], v2 offset:16
-; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1
+; CHECK-NEXT: ds_read_b128 v[2:5], v2
; CHECK-NEXT: s_waitcnt lgkmcnt(3)
; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30
; CHECK-NEXT: s_waitcnt lgkmcnt(3)
@@ -509,8 +509,8 @@ define void @memmove_p0_p3_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr add
; CHECK-LABEL: memmove_p0_p3_sz32_align_1_1:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset0:2 offset1:3
-; CHECK-NEXT: ds_read2_b64 v[7:10], v2 offset1:1
+; CHECK-NEXT: ds_read_b128 v[3:6], v2 offset:16
+; CHECK-NEXT: ds_read_b128 v[7:10], v2
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
@@ -526,7 +526,7 @@ define void @memmove_p0_p3_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
; CHECK-LABEL: memmove_p0_p3_sz16_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1
+; CHECK-NEXT: ds_read_b128 v[2:5], v2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
@@ -544,7 +544,7 @@ define void @memmove_p0_p3_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
; CHECK-NEXT: ds_read_u8 v9, v2 offset:30
; CHECK-NEXT: ds_read_u16 v10, v2 offset:28
; CHECK-NEXT: ds_read_b64 v[6:7], v2 offset:16
-; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1
+; CHECK-NEXT: ds_read_b128 v[2:5], v2
; CHECK-NEXT: s_waitcnt lgkmcnt(3)
; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30
; CHECK-NEXT: s_waitcnt lgkmcnt(3)
@@ -564,8 +564,8 @@ define void @memmove_p0_p3_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr add
; CHECK-LABEL: memmove_p0_p3_sz32_align_2_2:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset0:2 offset1:3
-; CHECK-NEXT: ds_read2_b64 v[7:10], v2 offset1:1
+; CHECK-NEXT: ds_read_b128 v[3:6], v2 offset:16
+; CHECK-NEXT: ds_read_b128 v[7:10], v2
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
@@ -2077,21 +2077,23 @@ define void @memmove_p1_p5_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr add
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_clause 0x8
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
-; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:24
; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28
-; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: global_store_dword v[0:1], v10, off offset:24
; CHECK-NEXT: s_waitcnt vmcnt(3)
; CHECK-NEXT: global_store_short v[0:1], v11, off offset:28
-; CHECK-NEXT: global_store_byte v[0:1], v10, off offset:30
-; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT: global_store_byte v[0:1], v9, off offset:30
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx3 v[0:1], v[7:9], off offset:16
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off offset:8
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false)
@@ -2143,21 +2145,23 @@ define void @memmove_p1_p5_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr add
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_clause 0x8
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30
-; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen
-; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4
-; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8
-; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:30
+; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen
+; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:24
; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:28
-; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16
-; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20
-; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT: s_waitcnt vmcnt(4)
+; CHECK-NEXT: global_store_dword v[0:1], v10, off offset:24
; CHECK-NEXT: s_waitcnt vmcnt(3)
; CHECK-NEXT: global_store_short v[0:1], v11, off offset:28
-; CHECK-NEXT: global_store_byte v[0:1], v10, off offset:30
-; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off
+; CHECK-NEXT: global_store_byte v[0:1], v9, off offset:30
+; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx3 v[0:1], v[7:9], off offset:16
+; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off offset:8
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false)
>From 4b11ea18d42299a4fee952e9040b79d60d3accf8 Mon Sep 17 00:00:00 2001
From: Matthew Curtis <macurtis at amd.com>
Date: Thu, 11 Sep 2025 18:59:56 -0500
Subject: [PATCH 2/4] fixup! [AMDGPU] Enable more consecutive load folding
during aggressive-instcombine
---
.../AggressiveInstCombine}/fold-consecutive-loads.ll | 0
1 file changed, 0 insertions(+), 0 deletions(-)
rename llvm/test/{CodeGen/AMDGPU => Transforms/AggressiveInstCombine}/fold-consecutive-loads.ll (100%)
diff --git a/llvm/test/CodeGen/AMDGPU/fold-consecutive-loads.ll b/llvm/test/Transforms/AggressiveInstCombine/fold-consecutive-loads.ll
similarity index 100%
rename from llvm/test/CodeGen/AMDGPU/fold-consecutive-loads.ll
rename to llvm/test/Transforms/AggressiveInstCombine/fold-consecutive-loads.ll
>From 881a364ba3f8500f13d517767c41935c76bb0820 Mon Sep 17 00:00:00 2001
From: Matthew Curtis <macurtis at amd.com>
Date: Thu, 11 Sep 2025 19:02:58 -0500
Subject: [PATCH 3/4] fixup! [AMDGPU] Enable more consecutive load folding
during aggressive-instcombine
---
.../fold-consecutive-loads.ll | 495 +++++-------------
1 file changed, 138 insertions(+), 357 deletions(-)
diff --git a/llvm/test/Transforms/AggressiveInstCombine/fold-consecutive-loads.ll b/llvm/test/Transforms/AggressiveInstCombine/fold-consecutive-loads.ll
index 610760f788ea8..8fe242598ef0c 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/fold-consecutive-loads.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/fold-consecutive-loads.ll
@@ -51,407 +51,188 @@ bb:
; The following test case reduced from a client kernel
-%struct.eggs = type { i8 }
-%struct.pluto = type { %struct.spam }
-%struct.spam = type { <32 x i8> }
-%struct.snork = type { i8 }
-%struct.quux = type { ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr }
-%struct.bar = type { ptr, ptr, ptr, ptr, ptr, ptr }
-
-define fastcc void @hoge(ptr noundef nonnull readonly align 8 captures(none) dereferenceable(48) %arg) {
-; CHECK-LABEL: define fastcc void @hoge(
-; CHECK-SAME: ptr noundef nonnull readonly align 8 captures(none) dereferenceable(48) [[ARG:%.*]]) #[[ATTR0]] {
+define fastcc <16 x float> @hoge(ptr %arg) {
+; CHECK-LABEL: define fastcc <16 x float> @hoge(
+; CHECK-SAME: ptr [[ARG:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[BB:.*:]]
; CHECK-NEXT: [[LOAD:%.*]] = load ptr, ptr [[ARG]], align 8
-; CHECK-NEXT: [[GETELEMENTPTR13:%.*]] = getelementptr inbounds nuw i8, ptr [[ARG]], i64 16
-; CHECK-NEXT: [[LOAD14:%.*]] = load ptr, ptr [[GETELEMENTPTR13]], align 8
; CHECK-NEXT: [[LOAD28:%.*]] = load i64, ptr [[LOAD]], align 1
-; CHECK-NEXT: [[LOAD29:%.*]] = load i64, ptr [[LOAD14]], align 1
-; CHECK-NEXT: [[GETELEMENTPTR72:%.*]] = getelementptr inbounds nuw i8, ptr [[LOAD]], i64 8
+; CHECK-NEXT: [[GETELEMENTPTR72:%.*]] = getelementptr i8, ptr [[LOAD]], i64 8
; CHECK-NEXT: [[LOAD73:%.*]] = load i64, ptr [[GETELEMENTPTR72]], align 1
-; CHECK-NEXT: [[GETELEMENTPTR75:%.*]] = getelementptr inbounds nuw i8, ptr [[LOAD14]], i64 8
-; CHECK-NEXT: [[LOAD76:%.*]] = load i64, ptr [[GETELEMENTPTR75]], align 1
-; CHECK-NEXT: [[GETELEMENTPTR120:%.*]] = getelementptr inbounds nuw i8, ptr [[LOAD]], i64 16
+; CHECK-NEXT: [[GETELEMENTPTR120:%.*]] = getelementptr i8, ptr [[LOAD]], i64 16
; CHECK-NEXT: [[LOAD121:%.*]] = load i64, ptr [[GETELEMENTPTR120]], align 1
-; CHECK-NEXT: [[GETELEMENTPTR123:%.*]] = getelementptr inbounds nuw i8, ptr [[LOAD14]], i64 16
-; CHECK-NEXT: [[LOAD124:%.*]] = load i64, ptr [[GETELEMENTPTR123]], align 1
-; CHECK-NEXT: [[GETELEMENTPTR168:%.*]] = getelementptr inbounds nuw i8, ptr [[LOAD]], i64 24
+; CHECK-NEXT: [[GETELEMENTPTR168:%.*]] = getelementptr i8, ptr [[LOAD]], i64 24
; CHECK-NEXT: [[LOAD169:%.*]] = load i64, ptr [[GETELEMENTPTR168]], align 1
-; CHECK-NEXT: [[GETELEMENTPTR171:%.*]] = getelementptr inbounds nuw i8, ptr [[LOAD14]], i64 24
-; CHECK-NEXT: [[LOAD172:%.*]] = load i32, ptr [[GETELEMENTPTR171]], align 1
-; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[LOAD172]] to i64
-; CHECK-NEXT: [[GETELEMENTPTR195:%.*]] = getelementptr inbounds nuw i8, ptr [[LOAD14]], i64 28
-; CHECK-NEXT: [[LOAD196:%.*]] = load i8, ptr [[GETELEMENTPTR195]], align 1
-; CHECK-NEXT: [[ALLOCA2_SROA_30_28_INSERT_EXT:%.*]] = zext i8 [[LOAD196]] to i64
-; CHECK-NEXT: [[ALLOCA2_SROA_30_28_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[ALLOCA2_SROA_30_28_INSERT_EXT]], 32
-; CHECK-NEXT: [[GETELEMENTPTR201:%.*]] = getelementptr inbounds nuw i8, ptr [[LOAD14]], i64 29
-; CHECK-NEXT: [[LOAD202:%.*]] = load i8, ptr [[GETELEMENTPTR201]], align 1
-; CHECK-NEXT: [[ALLOCA2_SROA_30_29_INSERT_EXT:%.*]] = zext i8 [[LOAD202]] to i64
-; CHECK-NEXT: [[ALLOCA2_SROA_30_29_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[ALLOCA2_SROA_30_29_INSERT_EXT]], 40
-; CHECK-NEXT: [[ALLOCA2_SROA_30_29_INSERT_MASK:%.*]] = or disjoint i64 [[TMP0]], [[ALLOCA2_SROA_30_28_INSERT_SHIFT]]
-; CHECK-NEXT: [[GETELEMENTPTR207:%.*]] = getelementptr inbounds nuw i8, ptr [[LOAD14]], i64 30
-; CHECK-NEXT: [[LOAD208:%.*]] = load i8, ptr [[GETELEMENTPTR207]], align 1
-; CHECK-NEXT: [[ALLOCA2_SROA_30_30_INSERT_EXT:%.*]] = zext i8 [[LOAD208]] to i64
-; CHECK-NEXT: [[ALLOCA2_SROA_30_30_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[ALLOCA2_SROA_30_30_INSERT_EXT]], 48
-; CHECK-NEXT: [[GETELEMENTPTR213:%.*]] = getelementptr inbounds nuw i8, ptr [[LOAD14]], i64 31
-; CHECK-NEXT: [[LOAD214:%.*]] = load i8, ptr [[GETELEMENTPTR213]], align 1
-; CHECK-NEXT: [[ALLOCA2_SROA_30_31_INSERT_EXT:%.*]] = zext i8 [[LOAD214]] to i64
-; CHECK-NEXT: [[ALLOCA2_SROA_30_31_INSERT_SHIFT:%.*]] = shl nuw i64 [[ALLOCA2_SROA_30_31_INSERT_EXT]], 56
-; CHECK-NEXT: [[ALLOCA2_SROA_30_30_INSERT_MASK_MASKED:%.*]] = or i64 [[ALLOCA2_SROA_30_29_INSERT_MASK]], [[ALLOCA2_SROA_30_29_INSERT_SHIFT]]
-; CHECK-NEXT: [[ALLOCA2_SROA_30_31_INSERT_MASK:%.*]] = or i64 [[ALLOCA2_SROA_30_30_INSERT_MASK_MASKED]], [[ALLOCA2_SROA_30_30_INSERT_SHIFT]]
-; CHECK-NEXT: [[ALLOCA2_SROA_30_31_INSERT_INSERT:%.*]] = or i64 [[ALLOCA2_SROA_30_31_INSERT_MASK]], [[ALLOCA2_SROA_30_31_INSERT_SHIFT]]
-; CHECK-NEXT: [[GETELEMENTPTR216:%.*]] = getelementptr inbounds nuw i8, ptr [[ARG]], i64 40
-; CHECK-NEXT: [[LOAD217:%.*]] = load ptr, ptr [[GETELEMENTPTR216]], align 8
-; CHECK-NEXT: [[LOAD220:%.*]] = load <16 x float>, ptr [[LOAD217]], align 64
-; CHECK-NEXT: [[CALL:%.*]] = call contract <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 [[LOAD28]], i64 [[LOAD29]], <16 x float> [[LOAD220]], i32 0, i32 0, i32 0)
-; CHECK-NEXT: store <16 x float> [[CALL]], ptr [[LOAD217]], align 64
-; CHECK-NEXT: [[CALL225:%.*]] = call contract <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 [[LOAD73]], i64 [[LOAD76]], <16 x float> [[CALL]], i32 0, i32 0, i32 0)
-; CHECK-NEXT: store <16 x float> [[CALL225]], ptr [[LOAD217]], align 64
-; CHECK-NEXT: [[CALL230:%.*]] = call contract <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 [[LOAD121]], i64 [[LOAD124]], <16 x float> [[CALL225]], i32 0, i32 0, i32 0)
-; CHECK-NEXT: [[CALL235:%.*]] = call contract <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 [[LOAD169]], i64 [[ALLOCA2_SROA_30_31_INSERT_INSERT]], <16 x float> [[CALL230]], i32 0, i32 0, i32 0)
-; CHECK-NEXT: store <16 x float> [[CALL235]], ptr [[LOAD217]], align 64
-; CHECK-NEXT: ret void
+; CHECK-NEXT: [[CALL:%.*]] = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 [[LOAD28]], i64 0, <16 x float> zeroinitializer, i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[CALL225:%.*]] = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 [[LOAD73]], i64 0, <16 x float> [[CALL]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[CALL230:%.*]] = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 [[LOAD121]], i64 0, <16 x float> [[CALL225]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: [[CALL235:%.*]] = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 [[LOAD169]], i64 0, <16 x float> [[CALL230]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: ret <16 x float> [[CALL235]]
;
bb:
- %alloca = alloca %struct.eggs, align 1, addrspace(5)
- %alloca1 = alloca %struct.pluto, align 32, addrspace(5)
- %alloca2 = alloca %struct.pluto, align 32, addrspace(5)
- %alloca3 = alloca %struct.snork, align 1, addrspace(5)
- %alloca4 = alloca %struct.quux, align 8, addrspace(5)
- %addrspacecast = addrspacecast ptr addrspace(5) %alloca to ptr
- %addrspacecast5 = addrspacecast ptr addrspace(5) %alloca1 to ptr
- %addrspacecast6 = addrspacecast ptr addrspace(5) %alloca2 to ptr
- call void @llvm.lifetime.start.p5(i64 32, ptr addrspace(5) %alloca1)
- call void @llvm.memset.p5.i64(ptr addrspace(5) align 32 %alloca1, i8 0, i64 32, i1 false)
- call void @llvm.lifetime.start.p5(i64 32, ptr addrspace(5) %alloca2)
- call void @llvm.memset.p5.i64(ptr addrspace(5) align 32 %alloca2, i8 0, i64 32, i1 false)
- call void @llvm.lifetime.start.p5(i64 1, ptr addrspace(5) %alloca3)
- store ptr %addrspacecast5, ptr addrspace(5) %alloca4, align 8
- %getelementptr = getelementptr inbounds %struct.quux, ptr addrspace(5) %alloca4, i64 0, i32 1
%load = load ptr, ptr %arg, align 8
- store ptr %load, ptr addrspace(5) %getelementptr, align 8
- %getelementptr7 = getelementptr inbounds %struct.quux, ptr addrspace(5) %alloca4, i64 0, i32 2
- %getelementptr8 = getelementptr inbounds %struct.bar, ptr %arg, i64 0, i32 1
- %load9 = load ptr, ptr %getelementptr8, align 8
- store ptr %load9, ptr addrspace(5) %getelementptr7, align 8
- %getelementptr10 = getelementptr inbounds %struct.quux, ptr addrspace(5) %alloca4, i64 0, i32 3
- store ptr %addrspacecast, ptr addrspace(5) %getelementptr10, align 8
- %getelementptr11 = getelementptr inbounds %struct.quux, ptr addrspace(5) %alloca4, i64 0, i32 4
- store ptr %addrspacecast6, ptr addrspace(5) %getelementptr11, align 8
- %getelementptr12 = getelementptr inbounds %struct.quux, ptr addrspace(5) %alloca4, i64 0, i32 5
- %getelementptr13 = getelementptr inbounds %struct.bar, ptr %arg, i64 0, i32 2
- %load14 = load ptr, ptr %getelementptr13, align 8
- store ptr %load14, ptr addrspace(5) %getelementptr12, align 8
- %getelementptr15 = getelementptr inbounds %struct.quux, ptr addrspace(5) %alloca4, i64 0, i32 6
- %getelementptr16 = getelementptr inbounds %struct.bar, ptr %arg, i64 0, i32 3
- %load17 = load ptr, ptr %getelementptr16, align 8
- store ptr %load17, ptr addrspace(5) %getelementptr15, align 8
- %getelementptr18 = getelementptr inbounds %struct.quux, ptr addrspace(5) %alloca4, i64 0, i32 7
- %getelementptr19 = getelementptr inbounds %struct.bar, ptr %arg, i64 0, i32 4
- %load20 = load ptr, ptr %getelementptr19, align 8
- store ptr %load20, ptr addrspace(5) %getelementptr18, align 8
- %load21 = load ptr, ptr addrspace(5) %alloca4, align 8
- %getelementptr22 = getelementptr inbounds i8, ptr addrspace(5) %alloca4, i32 8
- %load23 = load ptr, ptr addrspace(5) %getelementptr22, align 8
- %getelementptr24 = getelementptr inbounds i8, ptr addrspace(5) %alloca4, i32 32
- %load25 = load ptr, ptr addrspace(5) %getelementptr24, align 8
- %getelementptr26 = getelementptr inbounds i8, ptr addrspace(5) %alloca4, i32 40
- %load27 = load ptr, ptr addrspace(5) %getelementptr26, align 8
- %load28 = load i8, ptr %load23, align 1
- store i8 %load28, ptr %load21, align 1
- %load29 = load i8, ptr %load27, align 1
- store i8 %load29, ptr %load25, align 1
- %getelementptr30 = getelementptr inbounds i8, ptr %load23, i64 1
+ %load28 = load i8, ptr %load, align 1
+ %getelementptr30 = getelementptr i8, ptr %load, i64 1
%load31 = load i8, ptr %getelementptr30, align 1
- %getelementptr32 = getelementptr inbounds i8, ptr %load21, i64 1
- store i8 %load31, ptr %getelementptr32, align 1
- %getelementptr33 = getelementptr inbounds i8, ptr %load27, i64 1
- %load34 = load i8, ptr %getelementptr33, align 1
- %getelementptr35 = getelementptr inbounds i8, ptr %load25, i64 1
- store i8 %load34, ptr %getelementptr35, align 1
- %getelementptr36 = getelementptr inbounds i8, ptr %load23, i64 2
+ %getelementptr36 = getelementptr i8, ptr %load, i64 2
%load37 = load i8, ptr %getelementptr36, align 1
- %getelementptr38 = getelementptr inbounds i8, ptr %load21, i64 2
- store i8 %load37, ptr %getelementptr38, align 1
- %getelementptr39 = getelementptr inbounds i8, ptr %load27, i64 2
- %load40 = load i8, ptr %getelementptr39, align 1
- %getelementptr41 = getelementptr inbounds i8, ptr %load25, i64 2
- store i8 %load40, ptr %getelementptr41, align 1
- %getelementptr42 = getelementptr inbounds i8, ptr %load23, i64 3
+ %getelementptr42 = getelementptr i8, ptr %load, i64 3
%load43 = load i8, ptr %getelementptr42, align 1
- %getelementptr44 = getelementptr inbounds i8, ptr %load21, i64 3
- store i8 %load43, ptr %getelementptr44, align 1
- %getelementptr45 = getelementptr inbounds i8, ptr %load27, i64 3
- %load46 = load i8, ptr %getelementptr45, align 1
- %getelementptr47 = getelementptr inbounds i8, ptr %load25, i64 3
- store i8 %load46, ptr %getelementptr47, align 1
- %getelementptr48 = getelementptr inbounds i8, ptr %load23, i64 4
+ %getelementptr48 = getelementptr i8, ptr %load, i64 4
%load49 = load i8, ptr %getelementptr48, align 1
- %getelementptr50 = getelementptr inbounds i8, ptr %load21, i64 4
- store i8 %load49, ptr %getelementptr50, align 1
- %getelementptr51 = getelementptr inbounds i8, ptr %load27, i64 4
- %load52 = load i8, ptr %getelementptr51, align 1
- %getelementptr53 = getelementptr inbounds i8, ptr %load25, i64 4
- store i8 %load52, ptr %getelementptr53, align 1
- %getelementptr54 = getelementptr inbounds i8, ptr %load23, i64 5
+ %getelementptr54 = getelementptr i8, ptr %load, i64 5
%load55 = load i8, ptr %getelementptr54, align 1
- %getelementptr56 = getelementptr inbounds i8, ptr %load21, i64 5
- store i8 %load55, ptr %getelementptr56, align 1
- %getelementptr57 = getelementptr inbounds i8, ptr %load27, i64 5
- %load58 = load i8, ptr %getelementptr57, align 1
- %getelementptr59 = getelementptr inbounds i8, ptr %load25, i64 5
- store i8 %load58, ptr %getelementptr59, align 1
- %getelementptr60 = getelementptr inbounds i8, ptr %load23, i64 6
+ %getelementptr60 = getelementptr i8, ptr %load, i64 6
%load61 = load i8, ptr %getelementptr60, align 1
- %getelementptr62 = getelementptr inbounds i8, ptr %load21, i64 6
- store i8 %load61, ptr %getelementptr62, align 1
- %getelementptr63 = getelementptr inbounds i8, ptr %load27, i64 6
- %load64 = load i8, ptr %getelementptr63, align 1
- %getelementptr65 = getelementptr inbounds i8, ptr %load25, i64 6
- store i8 %load64, ptr %getelementptr65, align 1
- %getelementptr66 = getelementptr inbounds i8, ptr %load23, i64 7
+ %getelementptr66 = getelementptr i8, ptr %load, i64 7
%load67 = load i8, ptr %getelementptr66, align 1
- %getelementptr68 = getelementptr inbounds i8, ptr %load21, i64 7
- store i8 %load67, ptr %getelementptr68, align 1
- %getelementptr69 = getelementptr inbounds i8, ptr %load27, i64 7
- %load70 = load i8, ptr %getelementptr69, align 1
- %getelementptr71 = getelementptr inbounds i8, ptr %load25, i64 7
- store i8 %load70, ptr %getelementptr71, align 1
- %getelementptr72 = getelementptr inbounds i8, ptr %load23, i64 8
+ %getelementptr72 = getelementptr i8, ptr %load, i64 8
%load73 = load i8, ptr %getelementptr72, align 1
- %getelementptr74 = getelementptr inbounds i8, ptr %load21, i64 8
- store i8 %load73, ptr %getelementptr74, align 1
- %getelementptr75 = getelementptr inbounds i8, ptr %load27, i64 8
- %load76 = load i8, ptr %getelementptr75, align 1
- %getelementptr77 = getelementptr inbounds i8, ptr %load25, i64 8
- store i8 %load76, ptr %getelementptr77, align 1
- %getelementptr78 = getelementptr inbounds i8, ptr %load23, i64 9
+ %getelementptr78 = getelementptr i8, ptr %load, i64 9
%load79 = load i8, ptr %getelementptr78, align 1
- %getelementptr80 = getelementptr inbounds i8, ptr %load21, i64 9
- store i8 %load79, ptr %getelementptr80, align 1
- %getelementptr81 = getelementptr inbounds i8, ptr %load27, i64 9
- %load82 = load i8, ptr %getelementptr81, align 1
- %getelementptr83 = getelementptr inbounds i8, ptr %load25, i64 9
- store i8 %load82, ptr %getelementptr83, align 1
- %getelementptr84 = getelementptr inbounds i8, ptr %load23, i64 10
+ %getelementptr84 = getelementptr i8, ptr %load, i64 10
%load85 = load i8, ptr %getelementptr84, align 1
- %getelementptr86 = getelementptr inbounds i8, ptr %load21, i64 10
- store i8 %load85, ptr %getelementptr86, align 1
- %getelementptr87 = getelementptr inbounds i8, ptr %load27, i64 10
- %load88 = load i8, ptr %getelementptr87, align 1
- %getelementptr89 = getelementptr inbounds i8, ptr %load25, i64 10
- store i8 %load88, ptr %getelementptr89, align 1
- %getelementptr90 = getelementptr inbounds i8, ptr %load23, i64 11
+ %getelementptr90 = getelementptr i8, ptr %load, i64 11
%load91 = load i8, ptr %getelementptr90, align 1
- %getelementptr92 = getelementptr inbounds i8, ptr %load21, i64 11
- store i8 %load91, ptr %getelementptr92, align 1
- %getelementptr93 = getelementptr inbounds i8, ptr %load27, i64 11
- %load94 = load i8, ptr %getelementptr93, align 1
- %getelementptr95 = getelementptr inbounds i8, ptr %load25, i64 11
- store i8 %load94, ptr %getelementptr95, align 1
- %getelementptr96 = getelementptr inbounds i8, ptr %load23, i64 12
+ %getelementptr96 = getelementptr i8, ptr %load, i64 12
%load97 = load i8, ptr %getelementptr96, align 1
- %getelementptr98 = getelementptr inbounds i8, ptr %load21, i64 12
- store i8 %load97, ptr %getelementptr98, align 1
- %getelementptr99 = getelementptr inbounds i8, ptr %load27, i64 12
- %load100 = load i8, ptr %getelementptr99, align 1
- %getelementptr101 = getelementptr inbounds i8, ptr %load25, i64 12
- store i8 %load100, ptr %getelementptr101, align 1
- %getelementptr102 = getelementptr inbounds i8, ptr %load23, i64 13
+ %getelementptr102 = getelementptr i8, ptr %load, i64 13
%load103 = load i8, ptr %getelementptr102, align 1
- %getelementptr104 = getelementptr inbounds i8, ptr %load21, i64 13
- store i8 %load103, ptr %getelementptr104, align 1
- %getelementptr105 = getelementptr inbounds i8, ptr %load27, i64 13
- %load106 = load i8, ptr %getelementptr105, align 1
- %getelementptr107 = getelementptr inbounds i8, ptr %load25, i64 13
- store i8 %load106, ptr %getelementptr107, align 1
- %getelementptr108 = getelementptr inbounds i8, ptr %load23, i64 14
+ %getelementptr108 = getelementptr i8, ptr %load, i64 14
%load109 = load i8, ptr %getelementptr108, align 1
- %getelementptr110 = getelementptr inbounds i8, ptr %load21, i64 14
- store i8 %load109, ptr %getelementptr110, align 1
- %getelementptr111 = getelementptr inbounds i8, ptr %load27, i64 14
- %load112 = load i8, ptr %getelementptr111, align 1
- %getelementptr113 = getelementptr inbounds i8, ptr %load25, i64 14
- store i8 %load112, ptr %getelementptr113, align 1
- %getelementptr114 = getelementptr inbounds i8, ptr %load23, i64 15
+ %getelementptr114 = getelementptr i8, ptr %load, i64 15
%load115 = load i8, ptr %getelementptr114, align 1
- %getelementptr116 = getelementptr inbounds i8, ptr %load21, i64 15
- store i8 %load115, ptr %getelementptr116, align 1
- %getelementptr117 = getelementptr inbounds i8, ptr %load27, i64 15
- %load118 = load i8, ptr %getelementptr117, align 1
- %getelementptr119 = getelementptr inbounds i8, ptr %load25, i64 15
- store i8 %load118, ptr %getelementptr119, align 1
- %getelementptr120 = getelementptr inbounds i8, ptr %load23, i64 16
+ %getelementptr120 = getelementptr i8, ptr %load, i64 16
%load121 = load i8, ptr %getelementptr120, align 1
- %getelementptr122 = getelementptr inbounds i8, ptr %load21, i64 16
- store i8 %load121, ptr %getelementptr122, align 1
- %getelementptr123 = getelementptr inbounds i8, ptr %load27, i64 16
- %load124 = load i8, ptr %getelementptr123, align 1
- %getelementptr125 = getelementptr inbounds i8, ptr %load25, i64 16
- store i8 %load124, ptr %getelementptr125, align 1
- %getelementptr126 = getelementptr inbounds i8, ptr %load23, i64 17
+ %getelementptr126 = getelementptr i8, ptr %load, i64 17
%load127 = load i8, ptr %getelementptr126, align 1
- %getelementptr128 = getelementptr inbounds i8, ptr %load21, i64 17
- store i8 %load127, ptr %getelementptr128, align 1
- %getelementptr129 = getelementptr inbounds i8, ptr %load27, i64 17
- %load130 = load i8, ptr %getelementptr129, align 1
- %getelementptr131 = getelementptr inbounds i8, ptr %load25, i64 17
- store i8 %load130, ptr %getelementptr131, align 1
- %getelementptr132 = getelementptr inbounds i8, ptr %load23, i64 18
+ %getelementptr132 = getelementptr i8, ptr %load, i64 18
%load133 = load i8, ptr %getelementptr132, align 1
- %getelementptr134 = getelementptr inbounds i8, ptr %load21, i64 18
- store i8 %load133, ptr %getelementptr134, align 1
- %getelementptr135 = getelementptr inbounds i8, ptr %load27, i64 18
- %load136 = load i8, ptr %getelementptr135, align 1
- %getelementptr137 = getelementptr inbounds i8, ptr %load25, i64 18
- store i8 %load136, ptr %getelementptr137, align 1
- %getelementptr138 = getelementptr inbounds i8, ptr %load23, i64 19
+ %getelementptr138 = getelementptr i8, ptr %load, i64 19
%load139 = load i8, ptr %getelementptr138, align 1
- %getelementptr140 = getelementptr inbounds i8, ptr %load21, i64 19
- store i8 %load139, ptr %getelementptr140, align 1
- %getelementptr141 = getelementptr inbounds i8, ptr %load27, i64 19
- %load142 = load i8, ptr %getelementptr141, align 1
- %getelementptr143 = getelementptr inbounds i8, ptr %load25, i64 19
- store i8 %load142, ptr %getelementptr143, align 1
- %getelementptr144 = getelementptr inbounds i8, ptr %load23, i64 20
+ %getelementptr144 = getelementptr i8, ptr %load, i64 20
%load145 = load i8, ptr %getelementptr144, align 1
- %getelementptr146 = getelementptr inbounds i8, ptr %load21, i64 20
- store i8 %load145, ptr %getelementptr146, align 1
- %getelementptr147 = getelementptr inbounds i8, ptr %load27, i64 20
- %load148 = load i8, ptr %getelementptr147, align 1
- %getelementptr149 = getelementptr inbounds i8, ptr %load25, i64 20
- store i8 %load148, ptr %getelementptr149, align 1
- %getelementptr150 = getelementptr inbounds i8, ptr %load23, i64 21
+ %getelementptr150 = getelementptr i8, ptr %load, i64 21
%load151 = load i8, ptr %getelementptr150, align 1
- %getelementptr152 = getelementptr inbounds i8, ptr %load21, i64 21
- store i8 %load151, ptr %getelementptr152, align 1
- %getelementptr153 = getelementptr inbounds i8, ptr %load27, i64 21
- %load154 = load i8, ptr %getelementptr153, align 1
- %getelementptr155 = getelementptr inbounds i8, ptr %load25, i64 21
- store i8 %load154, ptr %getelementptr155, align 1
- %getelementptr156 = getelementptr inbounds i8, ptr %load23, i64 22
+ %getelementptr156 = getelementptr i8, ptr %load, i64 22
%load157 = load i8, ptr %getelementptr156, align 1
- %getelementptr158 = getelementptr inbounds i8, ptr %load21, i64 22
- store i8 %load157, ptr %getelementptr158, align 1
- %getelementptr159 = getelementptr inbounds i8, ptr %load27, i64 22
- %load160 = load i8, ptr %getelementptr159, align 1
- %getelementptr161 = getelementptr inbounds i8, ptr %load25, i64 22
- store i8 %load160, ptr %getelementptr161, align 1
- %getelementptr162 = getelementptr inbounds i8, ptr %load23, i64 23
+ %getelementptr162 = getelementptr i8, ptr %load, i64 23
%load163 = load i8, ptr %getelementptr162, align 1
- %getelementptr164 = getelementptr inbounds i8, ptr %load21, i64 23
- store i8 %load163, ptr %getelementptr164, align 1
- %getelementptr165 = getelementptr inbounds i8, ptr %load27, i64 23
- %load166 = load i8, ptr %getelementptr165, align 1
- %getelementptr167 = getelementptr inbounds i8, ptr %load25, i64 23
- store i8 %load166, ptr %getelementptr167, align 1
- %getelementptr168 = getelementptr inbounds i8, ptr %load23, i64 24
+ %getelementptr168 = getelementptr i8, ptr %load, i64 24
%load169 = load i8, ptr %getelementptr168, align 1
- %getelementptr170 = getelementptr inbounds i8, ptr %load21, i64 24
- store i8 %load169, ptr %getelementptr170, align 1
- %getelementptr171 = getelementptr inbounds i8, ptr %load27, i64 24
- %load172 = load i8, ptr %getelementptr171, align 1
- %getelementptr173 = getelementptr inbounds i8, ptr %load25, i64 24
- store i8 %load172, ptr %getelementptr173, align 1
- %getelementptr174 = getelementptr inbounds i8, ptr %load23, i64 25
+ %getelementptr174 = getelementptr i8, ptr %load, i64 25
%load175 = load i8, ptr %getelementptr174, align 1
- %getelementptr176 = getelementptr inbounds i8, ptr %load21, i64 25
- store i8 %load175, ptr %getelementptr176, align 1
- %getelementptr177 = getelementptr inbounds i8, ptr %load27, i64 25
- %load178 = load i8, ptr %getelementptr177, align 1
- %getelementptr179 = getelementptr inbounds i8, ptr %load25, i64 25
- store i8 %load178, ptr %getelementptr179, align 1
- %getelementptr180 = getelementptr inbounds i8, ptr %load23, i64 26
+ %getelementptr180 = getelementptr i8, ptr %load, i64 26
%load181 = load i8, ptr %getelementptr180, align 1
- %getelementptr182 = getelementptr inbounds i8, ptr %load21, i64 26
- store i8 %load181, ptr %getelementptr182, align 1
- %getelementptr183 = getelementptr inbounds i8, ptr %load27, i64 26
- %load184 = load i8, ptr %getelementptr183, align 1
- %getelementptr185 = getelementptr inbounds i8, ptr %load25, i64 26
- store i8 %load184, ptr %getelementptr185, align 1
- %getelementptr186 = getelementptr inbounds i8, ptr %load23, i64 27
+ %getelementptr186 = getelementptr i8, ptr %load, i64 27
%load187 = load i8, ptr %getelementptr186, align 1
- %getelementptr188 = getelementptr inbounds i8, ptr %load21, i64 27
- store i8 %load187, ptr %getelementptr188, align 1
- %getelementptr189 = getelementptr inbounds i8, ptr %load27, i64 27
- %load190 = load i8, ptr %getelementptr189, align 1
- %getelementptr191 = getelementptr inbounds i8, ptr %load25, i64 27
- store i8 %load190, ptr %getelementptr191, align 1
- %getelementptr192 = getelementptr inbounds i8, ptr %load23, i64 28
+ %getelementptr192 = getelementptr i8, ptr %load, i64 28
%load193 = load i8, ptr %getelementptr192, align 1
- %getelementptr194 = getelementptr inbounds i8, ptr %load21, i64 28
- store i8 %load193, ptr %getelementptr194, align 1
- %getelementptr195 = getelementptr inbounds i8, ptr %load27, i64 28
- %load196 = load i8, ptr %getelementptr195, align 1
- %getelementptr197 = getelementptr inbounds i8, ptr %load25, i64 28
- store i8 %load196, ptr %getelementptr197, align 1
- %getelementptr198 = getelementptr inbounds i8, ptr %load23, i64 29
+ %getelementptr198 = getelementptr i8, ptr %load, i64 29
%load199 = load i8, ptr %getelementptr198, align 1
- %getelementptr200 = getelementptr inbounds i8, ptr %load21, i64 29
- store i8 %load199, ptr %getelementptr200, align 1
- %getelementptr201 = getelementptr inbounds i8, ptr %load27, i64 29
- %load202 = load i8, ptr %getelementptr201, align 1
- %getelementptr203 = getelementptr inbounds i8, ptr %load25, i64 29
- store i8 %load202, ptr %getelementptr203, align 1
- %getelementptr204 = getelementptr inbounds i8, ptr %load23, i64 30
+ %getelementptr204 = getelementptr i8, ptr %load, i64 30
%load205 = load i8, ptr %getelementptr204, align 1
- %getelementptr206 = getelementptr inbounds i8, ptr %load21, i64 30
- store i8 %load205, ptr %getelementptr206, align 1
- %getelementptr207 = getelementptr inbounds i8, ptr %load27, i64 30
- %load208 = load i8, ptr %getelementptr207, align 1
- %getelementptr209 = getelementptr inbounds i8, ptr %load25, i64 30
- store i8 %load208, ptr %getelementptr209, align 1
- %getelementptr210 = getelementptr inbounds i8, ptr %load23, i64 31
+ %getelementptr210 = getelementptr i8, ptr %load, i64 31
%load211 = load i8, ptr %getelementptr210, align 1
- %getelementptr212 = getelementptr inbounds i8, ptr %load21, i64 31
- store i8 %load211, ptr %getelementptr212, align 1
- %getelementptr213 = getelementptr inbounds i8, ptr %load27, i64 31
- %load214 = load i8, ptr %getelementptr213, align 1
- %getelementptr215 = getelementptr inbounds i8, ptr %load25, i64 31
- store i8 %load214, ptr %getelementptr215, align 1
- call void @llvm.lifetime.end.p5(i64 1, ptr addrspace(5) %alloca3)
- %getelementptr216 = getelementptr inbounds %struct.bar, ptr %arg, i64 0, i32 5
- %load217 = load ptr, ptr %getelementptr216, align 8
- %load218 = load i64, ptr addrspace(5) %alloca1, align 8
- %load219 = load i64, ptr addrspace(5) %alloca2, align 8
- %load220 = load <16 x float>, ptr %load217, align 64
- %call = call contract <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %load218, i64 %load219, <16 x float> %load220, i32 0, i32 0, i32 0)
- store <16 x float> %call, ptr %load217, align 64
- %getelementptr221 = getelementptr inbounds i8, ptr addrspace(5) %alloca1, i64 8
- %getelementptr222 = getelementptr inbounds i8, ptr addrspace(5) %alloca2, i64 8
- %load223 = load i64, ptr addrspace(5) %getelementptr221, align 8
- %load224 = load i64, ptr addrspace(5) %getelementptr222, align 8
- %call225 = call contract <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %load223, i64 %load224, <16 x float> %call, i32 0, i32 0, i32 0)
- store <16 x float> %call225, ptr %load217, align 64
- %getelementptr226 = getelementptr inbounds i8, ptr addrspace(5) %alloca1, i64 16
- %getelementptr227 = getelementptr inbounds i8, ptr addrspace(5) %alloca2, i64 16
- %load228 = load i64, ptr addrspace(5) %getelementptr226, align 8
- %load229 = load i64, ptr addrspace(5) %getelementptr227, align 8
- %call230 = call contract <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %load228, i64 %load229, <16 x float> %call225, i32 0, i32 0, i32 0)
- store <16 x float> %call230, ptr %load217, align 64
- %getelementptr231 = getelementptr inbounds i8, ptr addrspace(5) %alloca1, i64 24
- %getelementptr232 = getelementptr inbounds i8, ptr addrspace(5) %alloca2, i64 24
- %load233 = load i64, ptr addrspace(5) %getelementptr231, align 8
- %load234 = load i64, ptr addrspace(5) %getelementptr232, align 8
- %call235 = call contract <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %load233, i64 %load234, <16 x float> %call230, i32 0, i32 0, i32 0)
- store <16 x float> %call235, ptr %load217, align 64
- call void @llvm.lifetime.end.p5(i64 32, ptr addrspace(5) %alloca2)
- call void @llvm.lifetime.end.p5(i64 32, ptr addrspace(5) %alloca1)
- ret void
+ %alloca1.sroa.8.0.insert.ext = zext i8 %load67 to i64
+ %alloca1.sroa.8.0.insert.shift = shl i64 %alloca1.sroa.8.0.insert.ext, 56
+ %alloca1.sroa.7.0.insert.ext = zext i8 %load61 to i64
+ %alloca1.sroa.7.0.insert.shift = shl i64 %alloca1.sroa.7.0.insert.ext, 48
+ %alloca1.sroa.7.0.insert.insert = or i64 %alloca1.sroa.8.0.insert.shift, %alloca1.sroa.7.0.insert.shift
+ %alloca1.sroa.6.0.insert.ext = zext i8 %load55 to i64
+ %alloca1.sroa.6.0.insert.shift = shl i64 %alloca1.sroa.6.0.insert.ext, 40
+ %alloca1.sroa.6.0.insert.insert = or i64 %alloca1.sroa.7.0.insert.insert, %alloca1.sroa.6.0.insert.shift
+ %alloca1.sroa.5.0.insert.ext = zext i8 %load49 to i64
+ %alloca1.sroa.5.0.insert.shift = shl i64 %alloca1.sroa.5.0.insert.ext, 32
+ %alloca1.sroa.5.0.insert.insert = or i64 %alloca1.sroa.6.0.insert.insert, %alloca1.sroa.5.0.insert.shift
+ %alloca1.sroa.4.0.insert.ext = zext i8 %load43 to i64
+ %alloca1.sroa.4.0.insert.shift = shl i64 %alloca1.sroa.4.0.insert.ext, 24
+ %alloca1.sroa.4.0.insert.insert = or i64 %alloca1.sroa.5.0.insert.insert, %alloca1.sroa.4.0.insert.shift
+ %alloca1.sroa.3.0.insert.ext = zext i8 %load37 to i64
+ %alloca1.sroa.3.0.insert.shift = shl i64 %alloca1.sroa.3.0.insert.ext, 16
+ %alloca1.sroa.2.0.insert.ext = zext i8 %load31 to i64
+ %alloca1.sroa.2.0.insert.shift = shl i64 %alloca1.sroa.2.0.insert.ext, 8
+ %alloca1.sroa.2.0.insert.mask = or i64 %alloca1.sroa.4.0.insert.insert, %alloca1.sroa.3.0.insert.shift
+ %alloca1.sroa.0.0.insert.ext = zext i8 %load28 to i64
+ %alloca1.sroa.0.0.insert.mask = or i64 %alloca1.sroa.2.0.insert.mask, %alloca1.sroa.2.0.insert.shift
+ %alloca1.sroa.0.0.insert.insert = or i64 %alloca1.sroa.0.0.insert.mask, %alloca1.sroa.0.0.insert.ext
+ %call = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %alloca1.sroa.0.0.insert.insert, i64 0, <16 x float> zeroinitializer, i32 0, i32 0, i32 0)
+ %alloca1.sroa.17.8.insert.ext = zext i8 %load115 to i64
+ %alloca1.sroa.17.8.insert.shift = shl i64 %alloca1.sroa.17.8.insert.ext, 56
+ %alloca1.sroa.16.8.insert.ext = zext i8 %load109 to i64
+ %alloca1.sroa.16.8.insert.shift = shl i64 %alloca1.sroa.16.8.insert.ext, 48
+ %alloca1.sroa.16.8.insert.insert = or i64 %alloca1.sroa.17.8.insert.shift, %alloca1.sroa.16.8.insert.shift
+ %alloca1.sroa.15.8.insert.ext = zext i8 %load103 to i64
+ %alloca1.sroa.15.8.insert.shift = shl i64 %alloca1.sroa.15.8.insert.ext, 40
+ %alloca1.sroa.15.8.insert.insert = or i64 %alloca1.sroa.16.8.insert.insert, %alloca1.sroa.15.8.insert.shift
+ %alloca1.sroa.14.8.insert.ext = zext i8 %load97 to i64
+ %alloca1.sroa.14.8.insert.shift = shl i64 %alloca1.sroa.14.8.insert.ext, 32
+ %alloca1.sroa.14.8.insert.insert = or i64 %alloca1.sroa.15.8.insert.insert, %alloca1.sroa.14.8.insert.shift
+ %alloca1.sroa.13.8.insert.ext = zext i8 %load91 to i64
+ %alloca1.sroa.13.8.insert.shift = shl i64 %alloca1.sroa.13.8.insert.ext, 24
+ %alloca1.sroa.13.8.insert.insert = or i64 %alloca1.sroa.14.8.insert.insert, %alloca1.sroa.13.8.insert.shift
+ %alloca1.sroa.12.8.insert.ext = zext i8 %load85 to i64
+ %alloca1.sroa.12.8.insert.shift = shl i64 %alloca1.sroa.12.8.insert.ext, 16
+ %alloca1.sroa.11.8.insert.ext = zext i8 %load79 to i64
+ %alloca1.sroa.11.8.insert.shift = shl i64 %alloca1.sroa.11.8.insert.ext, 8
+ %alloca1.sroa.11.8.insert.mask = or i64 %alloca1.sroa.13.8.insert.insert, %alloca1.sroa.12.8.insert.shift
+ %alloca1.sroa.9.8.insert.ext = zext i8 %load73 to i64
+ %alloca1.sroa.9.8.insert.mask = or i64 %alloca1.sroa.11.8.insert.mask, %alloca1.sroa.11.8.insert.shift
+ %alloca1.sroa.9.8.insert.insert = or i64 %alloca1.sroa.9.8.insert.mask, %alloca1.sroa.9.8.insert.ext
+ %call225 = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %alloca1.sroa.9.8.insert.insert, i64 0, <16 x float> %call, i32 0, i32 0, i32 0)
+ %alloca1.sroa.26.16.insert.ext = zext i8 %load163 to i64
+ %alloca1.sroa.26.16.insert.shift = shl i64 %alloca1.sroa.26.16.insert.ext, 56
+ %alloca1.sroa.25.16.insert.ext = zext i8 %load157 to i64
+ %alloca1.sroa.25.16.insert.shift = shl i64 %alloca1.sroa.25.16.insert.ext, 48
+ %alloca1.sroa.25.16.insert.insert = or i64 %alloca1.sroa.26.16.insert.shift, %alloca1.sroa.25.16.insert.shift
+ %alloca1.sroa.24.16.insert.ext = zext i8 %load151 to i64
+ %alloca1.sroa.24.16.insert.shift = shl i64 %alloca1.sroa.24.16.insert.ext, 40
+ %alloca1.sroa.24.16.insert.insert = or i64 %alloca1.sroa.25.16.insert.insert, %alloca1.sroa.24.16.insert.shift
+ %alloca1.sroa.23.16.insert.ext = zext i8 %load145 to i64
+ %alloca1.sroa.23.16.insert.shift = shl i64 %alloca1.sroa.23.16.insert.ext, 32
+ %alloca1.sroa.23.16.insert.insert = or i64 %alloca1.sroa.24.16.insert.insert, %alloca1.sroa.23.16.insert.shift
+ %alloca1.sroa.22.16.insert.ext = zext i8 %load139 to i64
+ %alloca1.sroa.22.16.insert.shift = shl i64 %alloca1.sroa.22.16.insert.ext, 24
+ %alloca1.sroa.22.16.insert.insert = or i64 %alloca1.sroa.23.16.insert.insert, %alloca1.sroa.22.16.insert.shift
+ %alloca1.sroa.21.16.insert.ext = zext i8 %load133 to i64
+ %alloca1.sroa.21.16.insert.shift = shl i64 %alloca1.sroa.21.16.insert.ext, 16
+ %alloca1.sroa.20.16.insert.ext = zext i8 %load127 to i64
+ %alloca1.sroa.20.16.insert.shift = shl i64 %alloca1.sroa.20.16.insert.ext, 8
+ %alloca1.sroa.20.16.insert.mask = or i64 %alloca1.sroa.22.16.insert.insert, %alloca1.sroa.21.16.insert.shift
+ %alloca1.sroa.18.16.insert.ext = zext i8 %load121 to i64
+ %alloca1.sroa.18.16.insert.mask = or i64 %alloca1.sroa.20.16.insert.mask, %alloca1.sroa.20.16.insert.shift
+ %alloca1.sroa.18.16.insert.insert = or i64 %alloca1.sroa.18.16.insert.mask, %alloca1.sroa.18.16.insert.ext
+ %call230 = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %alloca1.sroa.18.16.insert.insert, i64 0, <16 x float> %call225, i32 0, i32 0, i32 0)
+ %alloca1.sroa.35.24.insert.ext = zext i8 %load211 to i64
+ %alloca1.sroa.35.24.insert.shift = shl i64 %alloca1.sroa.35.24.insert.ext, 56
+ %alloca1.sroa.34.24.insert.ext = zext i8 %load205 to i64
+ %alloca1.sroa.34.24.insert.shift = shl i64 %alloca1.sroa.34.24.insert.ext, 48
+ %alloca1.sroa.34.24.insert.insert = or i64 %alloca1.sroa.35.24.insert.shift, %alloca1.sroa.34.24.insert.shift
+ %alloca1.sroa.33.24.insert.ext = zext i8 %load199 to i64
+ %alloca1.sroa.33.24.insert.shift = shl i64 %alloca1.sroa.33.24.insert.ext, 40
+ %alloca1.sroa.33.24.insert.insert = or i64 %alloca1.sroa.34.24.insert.insert, %alloca1.sroa.33.24.insert.shift
+ %alloca1.sroa.32.24.insert.ext = zext i8 %load193 to i64
+ %alloca1.sroa.32.24.insert.shift = shl i64 %alloca1.sroa.32.24.insert.ext, 32
+ %alloca1.sroa.32.24.insert.insert = or i64 %alloca1.sroa.33.24.insert.insert, %alloca1.sroa.32.24.insert.shift
+ %alloca1.sroa.31.24.insert.ext = zext i8 %load187 to i64
+ %alloca1.sroa.31.24.insert.shift = shl i64 %alloca1.sroa.31.24.insert.ext, 24
+ %alloca1.sroa.31.24.insert.insert = or i64 %alloca1.sroa.32.24.insert.insert, %alloca1.sroa.31.24.insert.shift
+ %alloca1.sroa.30.24.insert.ext = zext i8 %load181 to i64
+ %alloca1.sroa.30.24.insert.shift = shl i64 %alloca1.sroa.30.24.insert.ext, 16
+ %alloca1.sroa.29.24.insert.ext = zext i8 %load175 to i64
+ %alloca1.sroa.29.24.insert.shift = shl i64 %alloca1.sroa.29.24.insert.ext, 8
+ %alloca1.sroa.29.24.insert.mask = or i64 %alloca1.sroa.31.24.insert.insert, %alloca1.sroa.30.24.insert.shift
+ %alloca1.sroa.27.24.insert.ext = zext i8 %load169 to i64
+ %alloca1.sroa.27.24.insert.mask = or i64 %alloca1.sroa.29.24.insert.mask, %alloca1.sroa.29.24.insert.shift
+ %alloca1.sroa.27.24.insert.insert = or i64 %alloca1.sroa.27.24.insert.mask, %alloca1.sroa.27.24.insert.ext
+ %call235 = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %alloca1.sroa.27.24.insert.insert, i64 0, <16 x float> %call230, i32 0, i32 0, i32 0)
+ ret <16 x float> %call235
}
; Function Attrs: convergent nocallback nofree nosync nounwind willreturn memory(none)
declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64, i64, <16 x float>, i32 immarg, i32 immarg, i32 immarg) #0
-; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
-declare void @llvm.lifetime.end.p5(i64 immarg, ptr addrspace(5) captures(none)) #1
-
-; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
-declare void @llvm.lifetime.start.p5(i64 immarg, ptr addrspace(5) captures(none)) #1
-
-; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: write)
-declare void @llvm.memset.p5.i64(ptr addrspace(5) writeonly captures(none), i8, i64, i1 immarg) #2
+; uselistorder directives
+uselistorder ptr @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8, { 3, 2, 1, 0 }
attributes #0 = { convergent nocallback nofree nosync nounwind willreturn memory(none) }
-attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
-attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: write) }
>From d9d6fcfbea709a7fa75fcd06dd1b6a5853b2832c Mon Sep 17 00:00:00 2001
From: Matthew Curtis <macurtis at amd.com>
Date: Fri, 12 Sep 2025 03:07:11 -0500
Subject: [PATCH 4/4] fixup! [AMDGPU] Enable more consecutive load folding
during aggressive-instcombine
---
.../AggressiveInstCombine/{ => AMDGPU}/fold-consecutive-loads.ll | 0
1 file changed, 0 insertions(+), 0 deletions(-)
rename llvm/test/Transforms/AggressiveInstCombine/{ => AMDGPU}/fold-consecutive-loads.ll (100%)
diff --git a/llvm/test/Transforms/AggressiveInstCombine/fold-consecutive-loads.ll b/llvm/test/Transforms/AggressiveInstCombine/AMDGPU/fold-consecutive-loads.ll
similarity index 100%
rename from llvm/test/Transforms/AggressiveInstCombine/fold-consecutive-loads.ll
rename to llvm/test/Transforms/AggressiveInstCombine/AMDGPU/fold-consecutive-loads.ll
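
For context when reading the regenerated checks above: aggressive-instcombine's load combining recognizes a chain of adjacent narrow loads that are zero-extended, shifted, and or'ed together (the byte-packing shape SROA leaves behind in these tests) and replaces it with a single wide load when the target reports the misaligned wide access as allowed and fast, which is what the allowsMisalignedMemoryAccessesImpl change enables for unaligned-scratch targets. A minimal sketch of that transformation, with hypothetical value names chosen only for illustration and not taken from the patch:

  ; Input: four adjacent i8 loads packed into an i32 (little-endian layout).
  define i32 @pack_bytes(ptr %p) {
    %b0 = load i8, ptr %p, align 1
    %p1 = getelementptr inbounds i8, ptr %p, i64 1
    %b1 = load i8, ptr %p1, align 1
    %p2 = getelementptr inbounds i8, ptr %p, i64 2
    %b2 = load i8, ptr %p2, align 1
    %p3 = getelementptr inbounds i8, ptr %p, i64 3
    %b3 = load i8, ptr %p3, align 1
    %z0 = zext i8 %b0 to i32
    %z1 = zext i8 %b1 to i32
    %z2 = zext i8 %b2 to i32
    %z3 = zext i8 %b3 to i32
    %s1 = shl i32 %z1, 8
    %s2 = shl i32 %z2, 16
    %s3 = shl i32 %z3, 24
    %o01 = or i32 %z0, %s1
    %o012 = or i32 %o01, %s2
    %r = or i32 %o012, %s3
    ret i32 %r
  }

  ; Roughly what the pass produces when the wide unaligned access is
  ; reported as fast for the address space:
  ;   %r = load i32, ptr %p, align 1
  ;   ret i32 %r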