[llvm-branch-commits] [llvm] [LowerMemIntrinsics][AMDGPU] Optimize memset.pattern lowering (PR #185901)

Fabian Ritter via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Thu Mar 12 10:26:03 PDT 2026


https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/185901

>From 90d8edfae004fe3a504c99cb055317e960b639f0 Mon Sep 17 00:00:00 2001
From: Fabian Ritter <fabian.ritter at amd.com>
Date: Wed, 11 Mar 2026 08:23:28 -0400
Subject: [PATCH 1/3] [LowerMemIntrinsics][AMDGPU] Optimize memset.pattern
 lowering

This patch changes the lowering of the [experimental.memset.pattern intrinsic](https://llvm.org/docs/LangRef.html#llvm-experimental-memset-pattern-intrinsic)
to match the optimized memset and memcpy lowering when possible. (The tl;dr of
memset.pattern is that it is like memset, except that you can use it to set
values that are wider than a single byte.)

The memset.pattern lowering now queries `TTI::getMemcpyLoopLoweringType` for a
preferred memory access type. If the size of that type is a multiple of the
size of the set value's type, and if both types have consistent store and
alloc sizes (since memset.pattern behaves in a way that is not well suited to
access widening if store and alloc sizes differ), the memset.pattern is
lowered into two loops:
a main loop that stores a sufficiently wide vector splat of the SetValue with
the preferred memory access type and a residual loop that covers the remaining
set values individually.

In contrast to the memset lowering, this patch doesn't include a specialized
lowering for residual loops with known constant lengths. Loops that are
statically known to be unreachable will not be emitted.

For backends that don't override `TTI::getMemcpyLoopLoweringType`, the
generated code is mostly unchanged except for more consistent basic block
names, no more `br i1 false` for memset.patterns with known size, and a flipped
loop condition for memset.patterns with known size (see test changes).

This is a follow-up to a similar patch for memset: #169040
---
 .../Transforms/Utils/LowerMemIntrinsics.h     |  10 +-
 llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp |   6 +-
 .../AMDGPU/AMDGPULowerBufferFatPointers.cpp   |   3 +-
 .../Transforms/Utils/LowerMemIntrinsics.cpp   | 175 ++--
 .../lower-buffer-fat-pointers-mem-transfer.ll | 134 +++-
 llvm/test/CodeGen/AMDGPU/memset-pattern.ll    | 745 ++++++++++++++++++
 llvm/test/CodeGen/RISCV/memset-pattern.ll     |  62 +-
 .../AMDGPU/memset-pattern.ll                  | 273 +++++++
 .../PowerPC/memset-pattern.ll                 |   4 +-
 .../RISCV/memset-pattern.ll                   |  28 +-
 .../X86/memset-pattern.ll                     |  16 +-
 11 files changed, 1304 insertions(+), 152 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/memset-pattern.ll
 create mode 100644 llvm/test/Transforms/PreISelIntrinsicLowering/AMDGPU/memset-pattern.ll

diff --git a/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h b/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
index 1feb614f043bf..55c7119cd8fdb 100644
--- a/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
+++ b/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
@@ -72,7 +72,15 @@ LLVM_ABI void expandMemSetAsLoop(MemSetInst *MemSet,
                                  const TargetTransformInfo &TTI);
 
 /// Expand \p MemSetPattern as a loop. \p MemSet is not deleted.
-LLVM_ABI void expandMemSetPatternAsLoop(MemSetPatternInst *MemSet);
+/// If \p TTI is provided, the memset.pattern is expanded according to the
+/// target's preferences. Otherwise, it is expanded as an element-wise loop.
+LLVM_ABI void
+expandMemSetPatternAsLoop(MemSetPatternInst *MemSet,
+                          const TargetTransformInfo *TTI = nullptr);
+
+/// Expand \p MemSetPattern as a loop. \p MemSet is not deleted.
+LLVM_ABI void expandMemSetPatternAsLoop(MemSetPatternInst *MemSet,
+                                        const TargetTransformInfo &TTI);
 
 /// Expand \p AtomicMemCpy as a loop. \p AtomicMemCpy is not deleted.
 LLVM_ABI void expandAtomicMemCpyAsLoop(AnyMemCpyInst *AtomicMemCpy,
diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
index 0544995f979f7..3ee4e074a7083 100644
--- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
+++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
@@ -410,12 +410,14 @@ bool PreISelIntrinsicLowering::expandMemIntrinsicUses(
     }
     case Intrinsic::experimental_memset_pattern: {
       auto *Memset = cast<MemSetPatternInst>(Inst);
-      const TargetLibraryInfo &TLI = LookupTLI(*Memset->getFunction());
+      Function *ParentFunc = Memset->getFunction();
+      const TargetLibraryInfo &TLI = LookupTLI(*ParentFunc);
       Constant *PatternValue = getMemSetPattern16Value(Memset, TLI);
       if (!PatternValue) {
         // If it isn't possible to emit a memset_pattern16 libcall, expand to
         // a loop instead.
-        expandMemSetPatternAsLoop(Memset);
+        const TargetTransformInfo &TTI = LookupTTI(*ParentFunc);
+        expandMemSetPatternAsLoop(Memset, TTI);
         Changed = true;
         Memset->eraseFromParent();
         break;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 70ef607e6a57d..3ab23562ff1f9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -645,7 +645,8 @@ bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitMemSetPatternInst(
     MemSetPatternInst &MSPI) {
   if (MSPI.getDestAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER)
     return false;
-  llvm::expandMemSetPatternAsLoop(&MSPI);
+  llvm::expandMemSetPatternAsLoop(
+      &MSPI, TM->getTargetTransformInfo(*MSPI.getFunction()));
   MSPI.eraseFromParent();
   return true;
 }
diff --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
index da7260cdbac31..452de63ca76b8 100644
--- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -1307,62 +1307,103 @@ createMemSetLoopUnknownSize(Instruction *InsertBefore, Value *DstAddr,
                                     IsVolatile);
 }
 
-static void createMemSetLoop(Instruction *InsertBefore, Value *DstAddr,
-                             Value *CopyLen, Value *SetValue, Align DstAlign,
-                             std::optional<uint64_t> AverageTripCount,
-                             bool IsVolatile) {
-  // Currently no longer used for memset, only for memset.pattern.
-  // TODO: Update the memset.pattern lowering to also use the loop expansion
-  //       framework and remove this function.
-  Type *TypeOfCopyLen = CopyLen->getType();
-  BasicBlock *OrigBB = InsertBefore->getParent();
-  Function *F = OrigBB->getParent();
-  const DataLayout &DL = F->getDataLayout();
-  BasicBlock *NewBB =
-      OrigBB->splitBasicBlock(InsertBefore, "split");
-  BasicBlock *LoopBB
-    = BasicBlock::Create(F->getContext(), "loadstoreloop", F, NewBB);
+static void createMemSetPatternLoop(Instruction *InsertBefore, Value *DstAddr,
+                                    Value *Len, Value *SetValue, Align DstAlign,
+                                    bool IsVolatile,
+                                    const TargetTransformInfo *TTI,
+                                    std::optional<uint64_t> AverageTripCount) {
+  // No need to expand zero length memset.pattern.
+  if (auto *CLen = dyn_cast<ConstantInt>(Len))
+    if (CLen->isZero())
+      return;
 
-  const DebugLoc &DbgLoc = InsertBefore->getStableDebugLoc();
-  IRBuilder<> Builder(OrigBB->getTerminator());
-  Builder.SetCurrentDebugLocation(DbgLoc);
-
-  auto *ToLoopBR = Builder.CreateCondBr(
-      Builder.CreateICmpEQ(ConstantInt::get(TypeOfCopyLen, 0), CopyLen), NewBB,
-      LoopBB);
-  MDBuilder MDB(F->getContext());
-  if (AverageTripCount.has_value())
-    ToLoopBR->setMetadata(LLVMContext::MD_prof,
-                          MDB.createLikelyBranchWeights());
-  else
-    setExplicitlyUnknownBranchWeightsIfProfiled(*ToLoopBR, DEBUG_TYPE);
-
-  OrigBB->getTerminator()->eraseFromParent();
-
-  TypeSize PartSize = DL.getTypeStoreSize(SetValue->getType());
-  Align PartAlign(commonAlignment(DstAlign, PartSize));
-
-  IRBuilder<> LoopBuilder(LoopBB);
-  LoopBuilder.SetCurrentDebugLocation(DbgLoc);
-  PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
-  LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), OrigBB);
-
-  LoopBuilder.CreateAlignedStore(
-      SetValue,
-      LoopBuilder.CreateInBoundsGEP(SetValue->getType(), DstAddr, LoopIndex),
-      PartAlign, IsVolatile);
-
-  Value *NewIndex =
-      LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1));
-  LoopIndex->addIncoming(NewIndex, LoopBB);
-
-  auto *LoopBR = LoopBuilder.CreateCondBr(
-      LoopBuilder.CreateICmpULT(NewIndex, CopyLen), LoopBB, NewBB);
-  if (AverageTripCount.has_value())
-    setFittedBranchWeights(*LoopBR, {AverageTripCount.value(), 1},
-                           /*IsExpected=*/false);
-  else
-    setExplicitlyUnknownBranchWeightsIfProfiled(*LoopBR, DEBUG_TYPE);
+  BasicBlock *PreLoopBB = InsertBefore->getParent();
+  Function *ParentFunc = PreLoopBB->getParent();
+  const DataLayout &DL = ParentFunc->getDataLayout();
+  LLVMContext &Ctx = PreLoopBB->getContext();
+
+  unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
+
+  Type *PreferredLoopOpType = SetValue->getType();
+  if (TTI) {
+    PreferredLoopOpType = TTI->getMemcpyLoopLoweringType(
+        Ctx, Len, DstAS, DstAS, DstAlign, DstAlign, std::nullopt);
+  }
+  TypeSize PreferredLoopOpStoreSize = DL.getTypeStoreSize(PreferredLoopOpType);
+  assert(PreferredLoopOpStoreSize.isFixed() &&
+         "PreferredLoopOpType cannot be a scalable vector type");
+
+  TypeSize PreferredLoopOpAllocSize = DL.getTypeAllocSize(PreferredLoopOpType);
+
+  Type *OriginalType = SetValue->getType();
+  TypeSize OriginalTypeStoreSize = DL.getTypeStoreSize(OriginalType);
+  TypeSize OriginalTypeAllocSize = DL.getTypeAllocSize(OriginalType);
+
+  // The semantics of memset.pattern restrict what vectorization we can do: It
+  // has to behave like a series of stores of the SetValue type at offsets that
+  // are spaced by the alloc size of the SetValue type. If store and alloc size
+  // of the SetValue type don't match, the bytes that aren't covered by these
+  // stores must not be overwritten. We therefore only vectorize memset.pattern
+  // if the store and alloc sizes of the SetValue are equal and properly divide
+  // the size of the preferred lowering type (and only if store and alloc size
+  // for the preferred lowering type are also equal).
+
+  unsigned MainLoopStep = 1;
+  Type *MainLoopType = OriginalType;
+  TypeSize MainLoopAllocSize = OriginalTypeAllocSize;
+  unsigned ResidualLoopStep = 0;
+  Type *ResidualLoopType = nullptr;
+
+  if (PreferredLoopOpStoreSize == PreferredLoopOpAllocSize &&
+      OriginalTypeStoreSize == OriginalTypeAllocSize &&
+      OriginalTypeStoreSize < PreferredLoopOpStoreSize &&
+      PreferredLoopOpStoreSize % OriginalTypeStoreSize == 0) {
+    // Multiple instances of SetValue can be combined to reach the preferred
+    // loop op size.
+    MainLoopStep = PreferredLoopOpStoreSize / OriginalTypeStoreSize;
+    MainLoopType = PreferredLoopOpType;
+    MainLoopAllocSize = PreferredLoopOpStoreSize;
+
+    ResidualLoopStep = 1;
+    ResidualLoopType = OriginalType;
+  }
+
+  // The step arguments here are in terms of the alloc size of the SetValue, not
+  // in terms of bytes.
+  LoopExpansionInfo LEI =
+      insertLoopExpansion(InsertBefore, Len, MainLoopStep, ResidualLoopStep,
+                          "memset.pattern", AverageTripCount);
+
+  Align PartDstAlign(commonAlignment(DstAlign, MainLoopAllocSize));
+
+  if (LEI.MainLoopIP) {
+    // Create the loop-invariant splat value before the loop.
+    IRBuilder<> PreLoopBuilder(PreLoopBB->getTerminator());
+    Value *MainLoopSetValue = SetValue;
+    if (MainLoopType != OriginalType)
+      MainLoopSetValue =
+          createMemSetSplat(DL, PreLoopBuilder, SetValue, MainLoopType);
+
+    // Fill MainLoopBB
+    IRBuilder<> MainLoopBuilder(LEI.MainLoopIP);
+    Value *DstGEP = MainLoopBuilder.CreateInBoundsGEP(MainLoopType, DstAddr,
+                                                      LEI.MainLoopIndex);
+    MainLoopBuilder.CreateAlignedStore(MainLoopSetValue, DstGEP, PartDstAlign,
+                                       IsVolatile);
+  }
+
+  if (!LEI.ResidualLoopIP)
+    return;
+
+  // Fill ResidualLoopBB
+  Align ResDstAlign(
+      commonAlignment(PartDstAlign, DL.getTypeAllocSize(ResidualLoopType)));
+
+  IRBuilder<> ResLoopBuilder(LEI.ResidualLoopIP);
+  Value *ResDstGEP = ResLoopBuilder.CreateInBoundsGEP(ResidualLoopType, DstAddr,
+                                                      LEI.ResidualLoopIndex);
+  ResLoopBuilder.CreateAlignedStore(SetValue, ResDstGEP, ResDstAlign,
+                                    IsVolatile);
 }
 
 template <typename T>
@@ -1501,14 +1542,22 @@ void llvm::expandMemSetAsLoop(MemSetInst *MemSet,
   expandMemSetAsLoop(MemSet, &TTI);
 }
 
-void llvm::expandMemSetPatternAsLoop(MemSetPatternInst *Memset) {
-  createMemSetLoop(/*InsertBefore=*/Memset,
-                   /*DstAddr=*/Memset->getRawDest(),
-                   /*CopyLen=*/Memset->getLength(),
-                   /*SetValue=*/Memset->getValue(),
-                   /*DstAlign=*/Memset->getDestAlign().valueOrOne(),
-                   /*AverageTripCount=*/getAverageMemOpLoopTripCount(*Memset),
-                   /*IsVolatile=*/Memset->isVolatile());
+void llvm::expandMemSetPatternAsLoop(MemSetPatternInst *Memset,
+                                     const TargetTransformInfo *TTI) {
+  createMemSetPatternLoop(
+      /*InsertBefore=*/Memset,
+      /*DstAddr=*/Memset->getRawDest(),
+      /*Len=*/Memset->getLength(),
+      /*SetValue=*/Memset->getValue(),
+      /*DstAlign=*/Memset->getDestAlign().valueOrOne(),
+      /*IsVolatile=*/Memset->isVolatile(),
+      /*TTI=*/TTI,
+      /*AverageTripCount=*/getAverageMemOpLoopTripCount(*Memset));
+}
+
+void llvm::expandMemSetPatternAsLoop(MemSetPatternInst *MemSet,
+                                     const TargetTransformInfo &TTI) {
+  expandMemSetPatternAsLoop(MemSet, &TTI);
 }
 
 void llvm::expandAtomicMemCpyAsLoop(AnyMemCpyInst *AtomicMemcpy,
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll
index a57a4a38b1cbd..09710860743d0 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll
@@ -1724,15 +1724,45 @@ define void @memset_pattern_known(ptr addrspace(7) inreg %ptr) {
 ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0
 ; CHECK-NEXT:    [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1
-; CHECK-NEXT:    br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
+; CHECK-NEXT:    br label %[[LOADSTORELOOP:.*]]
 ; CHECK:       [[LOADSTORELOOP]]:
-; CHECK-NEXT:    [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ]
-; CHECK-NEXT:    [[DOTIDX:%.*]] = mul nsw i32 [[TMP1]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[DOTIDX]]
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 1, ptr addrspace(8) align 4 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0)
-; CHECK-NEXT:    [[TMP3]] = add i32 [[TMP1]], 1
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 8192
-; CHECK-NEXT:    br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
+; CHECK-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOADSTORELOOP]] ]
+; CHECK-NEXT:    [[DOTIDX1:%.*]] = mul nsw i32 [[LOOP_INDEX]], 256
+; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[PTR_OFF]], [[DOTIDX1]]
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[TMP5]], i32 0, i32 0)
+; CHECK-NEXT:    [[DOTPART_4:%.*]] = add nuw i32 [[TMP5]], 16
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_4]], i32 0, i32 0)
+; CHECK-NEXT:    [[DOTPART_8:%.*]] = add nuw i32 [[TMP5]], 32
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_8]], i32 0, i32 0)
+; CHECK-NEXT:    [[DOTPART_12:%.*]] = add nuw i32 [[TMP5]], 48
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_12]], i32 0, i32 0)
+; CHECK-NEXT:    [[DOTPART_16:%.*]] = add nuw i32 [[TMP5]], 64
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_16]], i32 0, i32 0)
+; CHECK-NEXT:    [[DOTPART_20:%.*]] = add nuw i32 [[TMP5]], 80
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_20]], i32 0, i32 0)
+; CHECK-NEXT:    [[DOTPART_24:%.*]] = add nuw i32 [[TMP5]], 96
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_24]], i32 0, i32 0)
+; CHECK-NEXT:    [[DOTPART_28:%.*]] = add nuw i32 [[TMP5]], 112
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_28]], i32 0, i32 0)
+; CHECK-NEXT:    [[DOTPART_32:%.*]] = add nuw i32 [[TMP5]], 128
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_32]], i32 0, i32 0)
+; CHECK-NEXT:    [[DOTPART_36:%.*]] = add nuw i32 [[TMP5]], 144
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_36]], i32 0, i32 0)
+; CHECK-NEXT:    [[DOTPART_40:%.*]] = add nuw i32 [[TMP5]], 160
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_40]], i32 0, i32 0)
+; CHECK-NEXT:    [[DOTPART_44:%.*]] = add nuw i32 [[TMP5]], 176
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_44]], i32 0, i32 0)
+; CHECK-NEXT:    [[DOTPART_48:%.*]] = add nuw i32 [[TMP5]], 192
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_48]], i32 0, i32 0)
+; CHECK-NEXT:    [[DOTPART_52:%.*]] = add nuw i32 [[TMP5]], 208
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_52]], i32 0, i32 0)
+; CHECK-NEXT:    [[DOTPART_56:%.*]] = add nuw i32 [[TMP5]], 224
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0)
+; CHECK-NEXT:    [[DOTPART_60:%.*]] = add nuw i32 [[TMP5]], 240
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0)
+; CHECK-NEXT:    [[TMP4]] = add i32 [[LOOP_INDEX]], 64
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult i32 [[TMP4]], 8192
+; CHECK-NEXT:    br i1 [[TMP3]], label %[[LOADSTORELOOP]], label %[[SPLIT:.*]]
 ; CHECK:       [[SPLIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -1745,15 +1775,15 @@ define void @memset_pattern_known_small(ptr addrspace(7) inreg %ptr) {
 ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0
 ; CHECK-NEXT:    [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1
-; CHECK-NEXT:    br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
-; CHECK:       [[LOADSTORELOOP]]:
-; CHECK-NEXT:    [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ]
+; CHECK-NEXT:    br label %[[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY:.*]]
+; CHECK:       [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY]] ]
 ; CHECK-NEXT:    [[DOTIDX:%.*]] = mul nsw i32 [[TMP1]], 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[DOTIDX]]
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0)
 ; CHECK-NEXT:    [[TMP3]] = add i32 [[TMP1]], 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 32
-; CHECK-NEXT:    br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY]], label %[[SPLIT:.*]]
 ; CHECK:       [[SPLIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -1766,16 +1796,46 @@ define void @memset_pattern_known_i64(ptr addrspace(7) inreg %ptr) {
 ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0
 ; CHECK-NEXT:    [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1
-; CHECK-NEXT:    br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
+; CHECK-NEXT:    br label %[[LOADSTORELOOP:.*]]
 ; CHECK:       [[LOADSTORELOOP]]:
-; CHECK-NEXT:    [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[LOADSTORELOOP]] ]
 ; CHECK-NEXT:    [[DOTC:%.*]] = trunc i64 [[TMP1]] to i32
-; CHECK-NEXT:    [[DOTIDX:%.*]] = mul nsw i32 [[DOTC]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[DOTIDX]]
-; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0)
-; CHECK-NEXT:    [[TMP3]] = add i64 [[TMP1]], 1
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 8192
-; CHECK-NEXT:    br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
+; CHECK-NEXT:    [[DOTIDX1:%.*]] = mul nsw i32 [[DOTC]], 256
+; CHECK-NEXT:    [[TMP8:%.*]] = add i32 [[PTR_OFF]], [[DOTIDX1]]
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP8]], i32 0, i32 0)
+; CHECK-NEXT:    [[DOTPART_4:%.*]] = add nuw i32 [[TMP8]], 16
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_4]], i32 0, i32 0)
+; CHECK-NEXT:    [[DOTPART_8:%.*]] = add nuw i32 [[TMP8]], 32
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_8]], i32 0, i32 0)
+; CHECK-NEXT:    [[DOTPART_12:%.*]] = add nuw i32 [[TMP8]], 48
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_12]], i32 0, i32 0)
+; CHECK-NEXT:    [[DOTPART_16:%.*]] = add nuw i32 [[TMP8]], 64
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_16]], i32 0, i32 0)
+; CHECK-NEXT:    [[DOTPART_20:%.*]] = add nuw i32 [[TMP8]], 80
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_20]], i32 0, i32 0)
+; CHECK-NEXT:    [[DOTPART_24:%.*]] = add nuw i32 [[TMP8]], 96
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_24]], i32 0, i32 0)
+; CHECK-NEXT:    [[DOTPART_28:%.*]] = add nuw i32 [[TMP8]], 112
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_28]], i32 0, i32 0)
+; CHECK-NEXT:    [[DOTPART_32:%.*]] = add nuw i32 [[TMP8]], 128
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_32]], i32 0, i32 0)
+; CHECK-NEXT:    [[DOTPART_36:%.*]] = add nuw i32 [[TMP8]], 144
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_36]], i32 0, i32 0)
+; CHECK-NEXT:    [[DOTPART_40:%.*]] = add nuw i32 [[TMP8]], 160
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_40]], i32 0, i32 0)
+; CHECK-NEXT:    [[DOTPART_44:%.*]] = add nuw i32 [[TMP8]], 176
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_44]], i32 0, i32 0)
+; CHECK-NEXT:    [[DOTPART_48:%.*]] = add nuw i32 [[TMP8]], 192
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_48]], i32 0, i32 0)
+; CHECK-NEXT:    [[DOTPART_52:%.*]] = add nuw i32 [[TMP8]], 208
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_52]], i32 0, i32 0)
+; CHECK-NEXT:    [[DOTPART_56:%.*]] = add nuw i32 [[TMP8]], 224
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0)
+; CHECK-NEXT:    [[DOTPART_60:%.*]] = add nuw i32 [[TMP8]], 240
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0)
+; CHECK-NEXT:    [[TMP5]] = add i64 [[TMP1]], 64
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP5]], 8192
+; CHECK-NEXT:    br i1 [[TMP3]], label %[[LOADSTORELOOP]], label %[[SPLIT:.*]]
 ; CHECK:       [[SPLIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -1788,15 +1848,15 @@ define void @memset_pattern_known_i32_volatile(ptr addrspace(7) inreg %ptr) {
 ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0
 ; CHECK-NEXT:    [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1
-; CHECK-NEXT:    br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
-; CHECK:       [[LOADSTORELOOP]]:
-; CHECK-NEXT:    [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ]
+; CHECK-NEXT:    br label %[[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY:.*]]
+; CHECK:       [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY]] ]
 ; CHECK-NEXT:    [[DOTIDX:%.*]] = mul nsw i32 [[TMP1]], 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[DOTIDX]]
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 -2147483648)
 ; CHECK-NEXT:    [[TMP3]] = add i32 [[TMP1]], 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 32
-; CHECK-NEXT:    br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY]], label %[[SPLIT:.*]]
 ; CHECK:       [[SPLIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -1809,16 +1869,30 @@ define void @memset_pattern_unknown(ptr addrspace(7) inreg %ptr, i32 inreg %leng
 ; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]], i32 inreg [[LENGTH:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0
 ; CHECK-NEXT:    [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 0, [[LENGTH]]
-; CHECK-NEXT:    br i1 [[TMP1]], label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[LENGTH]], 3
+; CHECK-NEXT:    [[TMP8:%.*]] = sub i32 [[LENGTH]], [[TMP1]]
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0
+; CHECK-NEXT:    br i1 [[TMP9]], label %[[LOADSTORELOOP:.*]], label %[[MEMSET_PATTERN_EXPANSION_RESIDUAL_COND:.*]]
 ; CHECK:       [[LOADSTORELOOP]]:
-; CHECK-NEXT:    [[TMP2:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOADSTORELOOP]] ]
+; CHECK-NEXT:    [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[LOADSTORELOOP]] ]
+; CHECK-NEXT:    [[DOTIDX1:%.*]] = mul nsw i32 [[LOOP_INDEX]], 16
+; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[PTR_OFF]], [[DOTIDX1]]
+; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP4]], i32 0, i32 0)
+; CHECK-NEXT:    [[TMP5]] = add i32 [[LOOP_INDEX]], 4
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP5]], [[TMP8]]
+; CHECK-NEXT:    br i1 [[TMP6]], label %[[LOADSTORELOOP]], label %[[MEMSET_PATTERN_EXPANSION_RESIDUAL_COND]]
+; CHECK:       [[MEMSET_PATTERN_EXPANSION_RESIDUAL_COND]]:
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TMP7]], label %[[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY:.*]], label %[[SPLIT:.*]]
+; CHECK:       [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY]]:
+; CHECK-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, %[[MEMSET_PATTERN_EXPANSION_RESIDUAL_COND]] ], [ [[TMP10:%.*]], %[[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[TMP8]], [[RESIDUAL_LOOP_INDEX]]
 ; CHECK-NEXT:    [[DOTIDX:%.*]] = mul nsw i32 [[TMP2]], 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[PTR_OFF]], [[DOTIDX]]
 ; CHECK-NEXT:    call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP3]], i32 0, i32 0)
-; CHECK-NEXT:    [[TMP4]] = add i32 [[TMP2]], 1
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], [[LENGTH]]
-; CHECK-NEXT:    br i1 [[TMP5]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
+; CHECK-NEXT:    [[TMP10]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ult i32 [[TMP10]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[TMP11]], label %[[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY]], label %[[SPLIT]]
 ; CHECK:       [[SPLIT]]:
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/memset-pattern.ll b/llvm/test/CodeGen/AMDGPU/memset-pattern.ll
new file mode 100644
index 0000000000000..3a5f730721016
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/memset-pattern.ll
@@ -0,0 +1,745 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 %s -o - | FileCheck -check-prefixes=GFX942,GFX942-SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 %s -o - | FileCheck -check-prefixes=GFX942,GFX942-GISEL %s
+
+define void @memset_pattern_i128_len0_dynvalue(ptr addrspace(1) align 16 %a, i128 %value) { ; Constant count 0 with a non-constant i128 value: the expected output contains no store.
+; GFX942-LABEL: memset_pattern_i128_len0_dynvalue:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.experimental.memset.pattern(ptr addrspace(1) align 16 %a, i128 %value, i64 0, i1 false)
+  ret void
+}
+
+define void @memset_pattern_i128_len1_dynvalue(ptr addrspace(1) align 16 %a, i128 %value) { ; Constant count 1 with a non-constant i128 value: the expected output is a single global_store_dwordx4 and no loop.
+; GFX942-LABEL: memset_pattern_i128_len1_dynvalue:
+; GFX942:       ; %bb.0: ; %memset.pattern-expansion-residual-body
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.experimental.memset.pattern(ptr addrspace(1) align 16 %a, i128 %value, i64 1, i1 false)
+  ret void
+}
+
+define void @memset_pattern_i128_len1(ptr addrspace(1) align 16 %a) { ; Constant count 1 with a constant i128 pattern: one global_store_dwordx4; the two instruction selectors materialize the constant differently, hence separate check prefixes.
+; GFX942-SDAG-LABEL: memset_pattern_i128_len1:
+; GFX942-SDAG:       ; %bb.0: ; %memset.pattern-expansion-residual-body
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, 0xdddddddd
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, 0xcccccccc
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, 0xbbbbbbbb
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, 0xaaaaaaaa
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: memset_pattern_i128_len1:
+; GFX942-GISEL:       ; %bb.0: ; %memset.pattern-expansion-residual-body
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_mov_b32 s2, 0xbbbbbbbb
+; GFX942-GISEL-NEXT:    s_mov_b32 s0, 0xdddddddd
+; GFX942-GISEL-NEXT:    s_mov_b32 s3, 0xaaaaaaaa
+; GFX942-GISEL-NEXT:    s_mov_b32 s1, 0xcccccccc
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.experimental.memset.pattern(ptr addrspace(1) align 16 %a, i128 u0xaaaaaaaabbbbbbbbccccccccdddddddd, i64 1, i1 false)
+  ret void
+}
+
+define void @memset_pattern_i128_constlen_mainloop_and_residual_taken(ptr addrspace(1) align 16 %a) { ; Constant count 19: the expected output has a main loop body of 16 global_store_dwordx4 followed by a residual loop that stores one dwordx4 per iteration.
+; GFX942-SDAG-LABEL: memset_pattern_i128_constlen_mainloop_and_residual_taken:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_mov_b32 s5, 0xcccccccc
+; GFX942-SDAG-NEXT:    s_mov_b32 s4, 0xdddddddd
+; GFX942-SDAG-NEXT:    s_mov_b32 s7, 0xaaaaaaaa
+; GFX942-SDAG-NEXT:    s_mov_b32 s6, 0xbbbbbbbb
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-SDAG-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-SDAG-NEXT:    s_mov_b64 s[2:3], 0x1000
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-SDAG-NEXT:  .LBB3_1: ; %memset.pattern-expansion-main-body
+; GFX942-SDAG-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-SDAG-NEXT:    s_add_u32 s0, s0, 16
+; GFX942-SDAG-NEXT:    s_addc_u32 s1, s1, 0
+; GFX942-SDAG-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off offset:240
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off offset:224
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off offset:208
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off offset:192
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off offset:176
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off offset:160
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off offset:144
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off offset:128
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off offset:112
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off offset:96
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off offset:80
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off offset:64
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off offset:48
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off offset:32
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off offset:16
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, s[2:3]
+; GFX942-SDAG-NEXT:    s_and_b64 vcc, exec, s[4:5]
+; GFX942-SDAG-NEXT:    s_cbranch_vccnz .LBB3_1
+; GFX942-SDAG-NEXT:  ; %bb.2: ; %memset.pattern-expansion-residual-body.preheader
+; GFX942-SDAG-NEXT:    s_mov_b64 s[0:1], 0x100
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX942-SDAG-NEXT:    s_mov_b32 s0, 1
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0xdddddddd
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, 0xcccccccc
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, 0xbbbbbbbb
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, 0xaaaaaaaa
+; GFX942-SDAG-NEXT:  .LBB3_3: ; %memset.pattern-expansion-residual-body
+; GFX942-SDAG-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-SDAG-NEXT:    s_add_i32 s1, s0, 1
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
+; GFX942-SDAG-NEXT:    s_cmp_lt_u32 s0, 3
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, 16
+; GFX942-SDAG-NEXT:    s_mov_b32 s0, s1
+; GFX942-SDAG-NEXT:    s_cbranch_scc1 .LBB3_3
+; GFX942-SDAG-NEXT:  ; %bb.4: ; %memset.pattern-post-expansion
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: memset_pattern_i128_constlen_mainloop_and_residual_taken:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_mov_b32 s2, 0xbbbbbbbb
+; GFX942-GISEL-NEXT:    s_mov_b32 s0, 0xdddddddd
+; GFX942-GISEL-NEXT:    s_mov_b32 s3, 0xaaaaaaaa
+; GFX942-GISEL-NEXT:    s_mov_b32 s1, 0xcccccccc
+; GFX942-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[0:1]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[4:5]
+; GFX942-GISEL-NEXT:  .LBB3_1: ; %memset.pattern-expansion-main-body
+; GFX942-GISEL-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v8, vcc, 16, v8
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v[6:7], v[2:5], off
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v[6:7], v[2:5], off offset:16
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v[6:7], v[2:5], off offset:32
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v[6:7], v[2:5], off offset:48
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v[6:7], v[2:5], off offset:64
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v[6:7], v[2:5], off offset:80
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v[6:7], v[2:5], off offset:96
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v[6:7], v[2:5], off offset:112
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v[6:7], v[2:5], off offset:128
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v[6:7], v[2:5], off offset:144
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v[6:7], v[2:5], off offset:160
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v[6:7], v[2:5], off offset:176
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v[6:7], v[2:5], off offset:192
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v[6:7], v[2:5], off offset:208
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v[6:7], v[2:5], off offset:224
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v[6:7], v[2:5], off offset:240
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e64 v9, s[0:1], 0, v9, vcc
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v6, vcc, 0x1000, v6
+; GFX942-GISEL-NEXT:    s_xor_b64 s[0:1], s[0:1], -1
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
+; GFX942-GISEL-NEXT:    s_xor_b64 s[0:1], s[0:1], -1
+; GFX942-GISEL-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; GFX942-GISEL-NEXT:    s_cbranch_vccnz .LBB3_1
+; GFX942-GISEL-NEXT:  ; %bb.2: ; %memset.pattern-expansion-residual-body.preheader
+; GFX942-GISEL-NEXT:    s_mov_b32 s0, 0xdddddddd
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v4, vcc, 0x100, v0
+; GFX942-GISEL-NEXT:    s_mov_b32 s1, 0xcccccccc
+; GFX942-GISEL-NEXT:    s_mov_b32 s2, 0xbbbbbbbb
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
+; GFX942-GISEL-NEXT:    s_mov_b32 s4, 1
+; GFX942-GISEL-NEXT:    s_mov_b32 s3, 0xaaaaaaaa
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v7, 0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v6, s4
+; GFX942-GISEL-NEXT:  .LBB3_3: ; %memset.pattern-expansion-residual-body
+; GFX942-GISEL-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
+; GFX942-GISEL-NEXT:    v_add_co_u32_e64 v4, s[0:1], 16, v4
+; GFX942-GISEL-NEXT:    v_cmp_gt_u64_e32 vcc, 3, v[6:7]
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v6, 1, v6
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e64 v5, s[0:1], 0, v5, s[0:1]
+; GFX942-GISEL-NEXT:    s_cbranch_vccnz .LBB3_3
+; GFX942-GISEL-NEXT:  ; %bb.4: ; %memset.pattern-post-expansion
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.experimental.memset.pattern(ptr addrspace(1) align 16 %a, i128 u0xaaaaaaaabbbbbbbbccccccccdddddddd, i64 19, i1 false)
+  ret void
+}
+
+define void @memset_pattern_i128_len1_lds(ptr addrspace(3) align 16 %a) { ; Constant count 1 with an addrspace(3) destination: the expected output is a single ds_write_b128 and no loop.
+; GFX942-SDAG-LABEL: memset_pattern_i128_len1_lds:
+; GFX942-SDAG:       ; %bb.0: ; %memset.pattern-expansion-residual-body
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, 0xdddddddd
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, 0xcccccccc
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, 0xbbbbbbbb
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, 0xaaaaaaaa
+; GFX942-SDAG-NEXT:    ds_write_b128 v0, v[2:5]
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: memset_pattern_i128_len1_lds:
+; GFX942-GISEL:       ; %bb.0: ; %memset.pattern-expansion-residual-body
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_mov_b32 s2, 0xbbbbbbbb
+; GFX942-GISEL-NEXT:    s_mov_b32 s0, 0xdddddddd
+; GFX942-GISEL-NEXT:    s_mov_b32 s3, 0xaaaaaaaa
+; GFX942-GISEL-NEXT:    s_mov_b32 s1, 0xcccccccc
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942-GISEL-NEXT:    ds_write_b128 v0, v[2:5]
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.experimental.memset.pattern(ptr addrspace(3) align 16 %a, i128 u0xaaaaaaaabbbbbbbbccccccccdddddddd, i64 1, i1 false)
+  ret void
+}
+
+define void @memset_pattern_i128_len1_no_align(ptr addrspace(1) %a) { ; Constant count 1 with no align attribute on %a (parameter or call): the expected output is still a single global_store_dwordx4.
+; GFX942-SDAG-LABEL: memset_pattern_i128_len1_no_align:
+; GFX942-SDAG:       ; %bb.0: ; %memset.pattern-expansion-residual-body
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, 0xdddddddd
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, 0xcccccccc
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, 0xbbbbbbbb
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, 0xaaaaaaaa
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: memset_pattern_i128_len1_no_align:
+; GFX942-GISEL:       ; %bb.0: ; %memset.pattern-expansion-residual-body
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_mov_b32 s2, 0xbbbbbbbb
+; GFX942-GISEL-NEXT:    s_mov_b32 s0, 0xdddddddd
+; GFX942-GISEL-NEXT:    s_mov_b32 s3, 0xaaaaaaaa
+; GFX942-GISEL-NEXT:    s_mov_b32 s1, 0xcccccccc
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.experimental.memset.pattern(ptr addrspace(1) %a, i128 u0xaaaaaaaabbbbbbbbccccccccdddddddd, i64 1, i1 false)
+  ret void
+}
+
+define void @memset_pattern_i128_len16(ptr addrspace(1) align 16 %a) { ; Constant count 16: the expected output has only the main loop body (16 global_store_dwordx4) and branches straight to post-expansion, with no residual loop.
+; GFX942-SDAG-LABEL: memset_pattern_i128_len16:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_mov_b32 s5, 0xcccccccc
+; GFX942-SDAG-NEXT:    s_mov_b32 s4, 0xdddddddd
+; GFX942-SDAG-NEXT:    s_mov_b32 s7, 0xaaaaaaaa
+; GFX942-SDAG-NEXT:    s_mov_b32 s6, 0xbbbbbbbb
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
+; GFX942-SDAG-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[6:7]
+; GFX942-SDAG-NEXT:    s_mov_b64 s[2:3], 0x1000
+; GFX942-SDAG-NEXT:  .LBB6_1: ; %memset.pattern-expansion-main-body
+; GFX942-SDAG-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-SDAG-NEXT:    s_add_u32 s0, s0, 16
+; GFX942-SDAG-NEXT:    s_addc_u32 s1, s1, 0
+; GFX942-SDAG-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:240
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:224
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:208
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:192
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:176
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:160
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:144
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:128
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:112
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:96
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:80
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:64
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:48
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:32
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:16
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
+; GFX942-SDAG-NEXT:    s_and_b64 vcc, exec, s[4:5]
+; GFX942-SDAG-NEXT:    s_cbranch_vccnz .LBB6_1
+; GFX942-SDAG-NEXT:  ; %bb.2: ; %memset.pattern-post-expansion
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: memset_pattern_i128_len16:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_mov_b32 s2, 0xbbbbbbbb
+; GFX942-GISEL-NEXT:    s_mov_b32 s0, 0xdddddddd
+; GFX942-GISEL-NEXT:    s_mov_b32 s3, 0xaaaaaaaa
+; GFX942-GISEL-NEXT:    s_mov_b32 s1, 0xcccccccc
+; GFX942-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[4:5]
+; GFX942-GISEL-NEXT:  .LBB6_1: ; %memset.pattern-expansion-main-body
+; GFX942-GISEL-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v6, vcc, 16, v6
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:16
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:32
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:48
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:64
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:80
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:96
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:112
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:128
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:144
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:160
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:176
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:192
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:208
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:224
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off offset:240
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e64 v7, s[0:1], 0, v7, vcc
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
+; GFX942-GISEL-NEXT:    s_xor_b64 s[0:1], s[0:1], -1
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX942-GISEL-NEXT:    s_xor_b64 s[0:1], s[0:1], -1
+; GFX942-GISEL-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; GFX942-GISEL-NEXT:    s_cbranch_vccnz .LBB6_1
+; GFX942-GISEL-NEXT:  ; %bb.2: ; %memset.pattern-post-expansion
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.experimental.memset.pattern(ptr addrspace(1) align 16 %a, i128 u0xaaaaaaaabbbbbbbbccccccccdddddddd, i64 16, i1 false)
+  ret void
+}
+
+define void @memset_pattern_i128_dynlen(ptr addrspace(1) align 16 %a, i64 %len) { ; Runtime count %len with a constant i128 pattern: the expected output is one loop (index step 1, one global_store_dwordx4 per iteration) behind a compare-against-zero guard.
+; GFX942-SDAG-LABEL: memset_pattern_i128_dynlen:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB7_3
+; GFX942-SDAG-NEXT:  ; %bb.1: ; %memset.pattern-expansion-main-body.preheader
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, 0xdddddddd
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, 0xcccccccc
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v6, 0xbbbbbbbb
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v7, 0xaaaaaaaa
+; GFX942-SDAG-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-SDAG-NEXT:  .LBB7_2: ; %memset.pattern-expansion-main-body
+; GFX942-SDAG-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-SDAG-NEXT:    s_add_u32 s4, s4, 1
+; GFX942-SDAG-NEXT:    s_addc_u32 s5, s5, 0
+; GFX942-SDAG-NEXT:    v_cmp_ge_u64_e32 vcc, s[4:5], v[2:3]
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off
+; GFX942-SDAG-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, 16
+; GFX942-SDAG-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB7_2
+; GFX942-SDAG-NEXT:  .LBB7_3: ; %Flow1
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: memset_pattern_i128_dynlen:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB7_3
+; GFX942-GISEL-NEXT:  ; %bb.1: ; %memset.pattern-expansion-main-body.preheader
+; GFX942-GISEL-NEXT:    s_mov_b32 s8, 0xdddddddd
+; GFX942-GISEL-NEXT:    s_mov_b32 s9, 0xcccccccc
+; GFX942-GISEL-NEXT:    s_mov_b32 s10, 0xbbbbbbbb
+; GFX942-GISEL-NEXT:    s_mov_b32 s11, 0xaaaaaaaa
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[8:9]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[10:11]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942-GISEL-NEXT:  .LBB7_2: ; %memset.pattern-expansion-main-body
+; GFX942-GISEL-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v8, vcc, 1, v8
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[4:7], off
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 16, v0
+; GFX942-GISEL-NEXT:    v_cmp_ge_u64_e64 s[0:1], v[8:9], v[2:3]
+; GFX942-GISEL-NEXT:    s_or_b64 s[2:3], s[0:1], s[2:3]
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX942-GISEL-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB7_2
+; GFX942-GISEL-NEXT:  .LBB7_3: ; %Flow1
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.experimental.memset.pattern(ptr addrspace(1) %a, i128 u0xaaaaaaaabbbbbbbbccccccccdddddddd, i64 %len, i1 false) ; NOTE(review): %a is declared align 16 on the parameter but passed here without align, unlike the _lds variant - confirm this is intended.
+  ret void
+}
+
+define void @memset_pattern_i128_dynlen_lds(ptr addrspace(3) align 16 %a, i64 %len) { ; Runtime count %len with an addrspace(3) destination: the expected output is one loop (index step 1, one ds_write_b128 per iteration) behind a compare-against-zero guard.
+; GFX942-SDAG-LABEL: memset_pattern_i128_dynlen_lds:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v7, v2
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v6, v1
+; GFX942-SDAG-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB8_3
+; GFX942-SDAG-NEXT:  ; %bb.1: ; %memset.pattern-expansion-main-body.preheader
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, 0xdddddddd
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, 0xcccccccc
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, 0xbbbbbbbb
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, 0xaaaaaaaa
+; GFX942-SDAG-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-SDAG-NEXT:  .LBB8_2: ; %memset.pattern-expansion-main-body
+; GFX942-SDAG-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-SDAG-NEXT:    s_add_u32 s4, s4, 1
+; GFX942-SDAG-NEXT:    s_addc_u32 s5, s5, 0
+; GFX942-SDAG-NEXT:    v_cmp_ge_u64_e32 vcc, s[4:5], v[6:7]
+; GFX942-SDAG-NEXT:    ds_write_b128 v0, v[2:5]
+; GFX942-SDAG-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v0, 16, v0
+; GFX942-SDAG-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB8_2
+; GFX942-SDAG-NEXT:  .LBB8_3: ; %Flow1
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: memset_pattern_i128_dynlen_lds:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v6, v1
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v7, v2
+; GFX942-GISEL-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB8_3
+; GFX942-GISEL-NEXT:  ; %bb.1: ; %memset.pattern-expansion-main-body.preheader
+; GFX942-GISEL-NEXT:    s_mov_b32 s4, 0xdddddddd
+; GFX942-GISEL-NEXT:    s_mov_b32 s5, 0xcccccccc
+; GFX942-GISEL-NEXT:    s_mov_b32 s6, 0xbbbbbbbb
+; GFX942-GISEL-NEXT:    s_mov_b32 s7, 0xaaaaaaaa
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[6:7]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-GISEL-NEXT:  .LBB8_2: ; %memset.pattern-expansion-main-body
+; GFX942-GISEL-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v8, vcc, 1, v8
+; GFX942-GISEL-NEXT:    ds_write_b128 v0, v[2:5]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
+; GFX942-GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[8:9], v[6:7]
+; GFX942-GISEL-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v0, 16, v0
+; GFX942-GISEL-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB8_2
+; GFX942-GISEL-NEXT:  .LBB8_3: ; %Flow1
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.experimental.memset.pattern(ptr addrspace(3) align 16 %a, i128 u0xaaaaaaaabbbbbbbbccccccccdddddddd, i64 %len, i1 false)
+  ret void
+}
+
+define void @memset_pattern_i32_dynlen(ptr addrspace(1) align 16 %a, i64 %len) { ; Constant i32 pattern with runtime count %len: the expected output has a main loop (index step 4, one global_store_dwordx4 per iteration) and a residual loop (index step 1, one global_store_dword per iteration), each behind its own compare-against-zero guard.
+; GFX942-SDAG-LABEL: memset_pattern_i32_dynlen:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v10, -4, v2
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v11, v3
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v8, 3, v2
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v9, 0
+; GFX942-SDAG-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB9_3
+; GFX942-SDAG-NEXT:  ; %bb.1: ; %memset.pattern-expansion-main-body.preheader
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, 0xaabbccdd
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, v4
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v6, v4
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v7, v4
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[12:13], v[0:1]
+; GFX942-SDAG-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-SDAG-NEXT:  .LBB9_2: ; %memset.pattern-expansion-main-body
+; GFX942-SDAG-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-SDAG-NEXT:    s_add_u32 s4, s4, 4
+; GFX942-SDAG-NEXT:    s_addc_u32 s5, s5, 0
+; GFX942-SDAG-NEXT:    v_cmp_ge_u64_e32 vcc, s[4:5], v[10:11]
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v[12:13], v[4:7], off
+; GFX942-SDAG-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[12:13], v[12:13], 0, 64
+; GFX942-SDAG-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB9_2
+; GFX942-SDAG-NEXT:  .LBB9_3: ; %Flow6
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-SDAG-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[8:9]
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB9_6
+; GFX942-SDAG-NEXT:  ; %bb.4: ; %memset.pattern-expansion-residual-body.preheader
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[2:3], 2, v[2:3]
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v2, -16, v2
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, 0xaabbccdd
+; GFX942-SDAG-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-SDAG-NEXT:  .LBB9_5: ; %memset.pattern-expansion-residual-body
+; GFX942-SDAG-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-SDAG-NEXT:    s_add_u32 s4, s4, 1
+; GFX942-SDAG-NEXT:    s_addc_u32 s5, s5, 0
+; GFX942-SDAG-NEXT:    v_cmp_ge_u64_e32 vcc, s[4:5], v[8:9]
+; GFX942-SDAG-NEXT:    global_store_dword v[0:1], v2, off
+; GFX942-SDAG-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, 4
+; GFX942-SDAG-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB9_5
+; GFX942-SDAG-NEXT:  .LBB9_6: ; %Flow4
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: memset_pattern_i32_dynlen:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v8, 3, v2
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v10, vcc, v2, v8
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v9, 0
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subbrev_co_u32_e32 v11, vcc, 0, v3, vcc
+; GFX942-GISEL-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB9_3
+; GFX942-GISEL-NEXT:  ; %bb.1: ; %memset.pattern-expansion-main-body.preheader
+; GFX942-GISEL-NEXT:    s_mov_b32 s8, 0xaabbccdd
+; GFX942-GISEL-NEXT:    s_mov_b32 s9, s8
+; GFX942-GISEL-NEXT:    s_mov_b32 s10, s8
+; GFX942-GISEL-NEXT:    s_mov_b32 s11, s8
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[8:9]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[10:11]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[0:1]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[2:3]
+; GFX942-GISEL-NEXT:  .LBB9_2: ; %memset.pattern-expansion-main-body
+; GFX942-GISEL-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v14, vcc, 4, v14
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v[12:13], v[4:7], off
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v15, vcc, 0, v15, vcc
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v12, vcc, 64, v12
+; GFX942-GISEL-NEXT:    v_cmp_ge_u64_e64 s[0:1], v[14:15], v[10:11]
+; GFX942-GISEL-NEXT:    s_or_b64 s[2:3], s[0:1], s[2:3]
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v13, vcc
+; GFX942-GISEL-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB9_2
+; GFX942-GISEL-NEXT:  .LBB9_3: ; %Flow6
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-GISEL-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[8:9]
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB9_6
+; GFX942-GISEL-NEXT:  ; %bb.4: ; %memset.pattern-expansion-residual-body.preheader
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[2:3], 2, v[2:3]
+; GFX942-GISEL-NEXT:    v_lshl_add_u64 v[0:1], v[2:3], 4, v[0:1]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v4, 0xaabbccdd
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-GISEL-NEXT:  .LBB9_5: ; %memset.pattern-expansion-residual-body
+; GFX942-GISEL-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v2, vcc, 1, v2
+; GFX942-GISEL-NEXT:    global_store_dword v[0:1], v4, off
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 4, v0
+; GFX942-GISEL-NEXT:    v_cmp_ge_u64_e64 s[0:1], v[2:3], v[8:9]
+; GFX942-GISEL-NEXT:    s_or_b64 s[2:3], s[0:1], s[2:3]
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX942-GISEL-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB9_5
+; GFX942-GISEL-NEXT:  .LBB9_6: ; %Flow4
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.experimental.memset.pattern(ptr addrspace(1) align 16 %a, i32 u0xaabbccdd, i64 %len, i1 false)
+  ret void
+}
+
+define void @memset_pattern_i32_dynval_dynlen(ptr addrspace(1) align 16 %a, i32 %val, i64 %len) {
+; GFX942-SDAG-LABEL: memset_pattern_i32_dynval_dynlen:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v8, v3
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v10, -4, v8
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v11, v4
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v9, v4
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v6, 3, v8
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v7, 0
+; GFX942-SDAG-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB10_3
+; GFX942-SDAG-NEXT:  ; %bb.1: ; %memset.pattern-expansion-main-body.preheader
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, v2
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, v2
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, v2
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[12:13], v[0:1]
+; GFX942-SDAG-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-SDAG-NEXT:  .LBB10_2: ; %memset.pattern-expansion-main-body
+; GFX942-SDAG-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-SDAG-NEXT:    s_add_u32 s4, s4, 4
+; GFX942-SDAG-NEXT:    s_addc_u32 s5, s5, 0
+; GFX942-SDAG-NEXT:    v_cmp_ge_u64_e32 vcc, s[4:5], v[10:11]
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v[12:13], v[2:5], off
+; GFX942-SDAG-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[12:13], v[12:13], 0, 64
+; GFX942-SDAG-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB10_2
+; GFX942-SDAG-NEXT:  .LBB10_3: ; %Flow6
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-SDAG-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB10_6
+; GFX942-SDAG-NEXT:  ; %bb.4: ; %memset.pattern-expansion-residual-body.preheader
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[4:5], 2, v[8:9]
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v4, -16, v4
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[4:5]
+; GFX942-SDAG-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-SDAG-NEXT:  .LBB10_5: ; %memset.pattern-expansion-residual-body
+; GFX942-SDAG-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-SDAG-NEXT:    s_add_u32 s4, s4, 1
+; GFX942-SDAG-NEXT:    s_addc_u32 s5, s5, 0
+; GFX942-SDAG-NEXT:    v_cmp_ge_u64_e32 vcc, s[4:5], v[6:7]
+; GFX942-SDAG-NEXT:    global_store_dword v[0:1], v2, off
+; GFX942-SDAG-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, 4
+; GFX942-SDAG-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB10_5
+; GFX942-SDAG-NEXT:  .LBB10_6: ; %Flow4
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: memset_pattern_i32_dynval_dynlen:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v8, v3
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v6, 3, v8
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v9, v4
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v10, vcc, v8, v6
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v7, 0
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subbrev_co_u32_e32 v11, vcc, 0, v9, vcc
+; GFX942-GISEL-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB10_3
+; GFX942-GISEL-NEXT:  ; %bb.1: ; %memset.pattern-expansion-main-body.preheader
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v3, v2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v4, v2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v5, v2
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[0:1]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[2:3]
+; GFX942-GISEL-NEXT:  .LBB10_2: ; %memset.pattern-expansion-main-body
+; GFX942-GISEL-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v14, vcc, 4, v14
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v[12:13], v[2:5], off
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v15, vcc, 0, v15, vcc
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v12, vcc, 64, v12
+; GFX942-GISEL-NEXT:    v_cmp_ge_u64_e64 s[0:1], v[14:15], v[10:11]
+; GFX942-GISEL-NEXT:    s_or_b64 s[2:3], s[0:1], s[2:3]
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v13, vcc
+; GFX942-GISEL-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB10_2
+; GFX942-GISEL-NEXT:  .LBB10_3: ; %Flow6
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-GISEL-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB10_6
+; GFX942-GISEL-NEXT:  ; %bb.4: ; %memset.pattern-expansion-residual-body.preheader
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[4:5], 2, v[8:9]
+; GFX942-GISEL-NEXT:    v_lshl_add_u64 v[0:1], v[4:5], 4, v[0:1]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
+; GFX942-GISEL-NEXT:  .LBB10_5: ; %memset.pattern-expansion-residual-body
+; GFX942-GISEL-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v4, vcc, 1, v4
+; GFX942-GISEL-NEXT:    global_store_dword v[0:1], v2, off
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 4, v0
+; GFX942-GISEL-NEXT:    v_cmp_ge_u64_e64 s[0:1], v[4:5], v[6:7]
+; GFX942-GISEL-NEXT:    s_or_b64 s[2:3], s[0:1], s[2:3]
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX942-GISEL-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB10_5
+; GFX942-GISEL-NEXT:  .LBB10_6: ; %Flow4
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.experimental.memset.pattern(ptr addrspace(1) align 16 %a, i32 %val, i64 %len, i1 false)
+  ret void
+}
+
+; For i96, the store size and the alloc size differ on AMDGPU, so this case is
+; not optimized.
+define void @memset_pattern_i96_dynval_dynlen(ptr addrspace(1) align 16 %a, i96 %val, i64 %len) {
+; GFX942-SDAG-LABEL: memset_pattern_i96_dynval_dynlen:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v7, v6
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v6, v5
+; GFX942-SDAG-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB11_3
+; GFX942-SDAG-NEXT:  ; %bb.1: ; %memset.pattern-expansion-main-body.preheader
+; GFX942-SDAG-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-SDAG-NEXT:  .LBB11_2: ; %memset.pattern-expansion-main-body
+; GFX942-SDAG-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-SDAG-NEXT:    s_add_u32 s4, s4, 1
+; GFX942-SDAG-NEXT:    s_addc_u32 s5, s5, 0
+; GFX942-SDAG-NEXT:    v_cmp_ge_u64_e32 vcc, s[4:5], v[6:7]
+; GFX942-SDAG-NEXT:    global_store_dwordx3 v[0:1], v[2:4], off
+; GFX942-SDAG-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, 16
+; GFX942-SDAG-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB11_2
+; GFX942-SDAG-NEXT:  .LBB11_3: ; %Flow1
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: memset_pattern_i96_dynval_dynlen:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v8, v5
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v9, v6
+; GFX942-GISEL-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[8:9]
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB11_3
+; GFX942-GISEL-NEXT:  ; %bb.1: ; %memset.pattern-expansion-main-body.preheader
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[2:3]
+; GFX942-GISEL-NEXT:  .LBB11_2: ; %memset.pattern-expansion-main-body
+; GFX942-GISEL-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v6, vcc, 1, v6
+; GFX942-GISEL-NEXT:    global_store_dwordx3 v[0:1], v[2:4], off
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 16, v0
+; GFX942-GISEL-NEXT:    v_cmp_ge_u64_e64 s[0:1], v[6:7], v[8:9]
+; GFX942-GISEL-NEXT:    s_or_b64 s[2:3], s[0:1], s[2:3]
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX942-GISEL-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB11_2
+; GFX942-GISEL-NEXT:  .LBB11_3: ; %Flow1
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.experimental.memset.pattern(ptr addrspace(1) align 16 %a, i96 %val, i64 %len, i1 false)
+  ret void
+}
+
diff --git a/llvm/test/CodeGen/RISCV/memset-pattern.ll b/llvm/test/CodeGen/RISCV/memset-pattern.ll
index 35ce7fad0ea67..cfc61144634b1 100644
--- a/llvm/test/CodeGen/RISCV/memset-pattern.ll
+++ b/llvm/test/CodeGen/RISCV/memset-pattern.ll
@@ -13,14 +13,14 @@
 
 define void @memset_1(ptr %a, i128 %value) nounwind {
 ; RV32-BOTH-LABEL: memset_1:
-; RV32-BOTH:       # %bb.0: # %loadstoreloop.preheader
+; RV32-BOTH:       # %bb.0:
 ; RV32-BOTH-NEXT:    li a2, 0
 ; RV32-BOTH-NEXT:    lw a3, 0(a1)
 ; RV32-BOTH-NEXT:    lw a4, 4(a1)
 ; RV32-BOTH-NEXT:    lw a5, 8(a1)
 ; RV32-BOTH-NEXT:    lw a1, 12(a1)
 ; RV32-BOTH-NEXT:    li a6, 0
-; RV32-BOTH-NEXT:  .LBB0_1: # %loadstoreloop
+; RV32-BOTH-NEXT:  .LBB0_1: # %memset.pattern-expansion-main-body
 ; RV32-BOTH-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV32-BOTH-NEXT:    slli a7, a2, 4
 ; RV32-BOTH-NEXT:    addi a2, a2, 1
@@ -33,19 +33,19 @@ define void @memset_1(ptr %a, i128 %value) nounwind {
 ; RV32-BOTH-NEXT:    sw a5, 8(a7)
 ; RV32-BOTH-NEXT:    sw a1, 12(a7)
 ; RV32-BOTH-NEXT:    beqz t0, .LBB0_1
-; RV32-BOTH-NEXT:  # %bb.2: # %split
+; RV32-BOTH-NEXT:  # %bb.2: # %memset.pattern-post-expansion
 ; RV32-BOTH-NEXT:    ret
 ;
 ; RV64-BOTH-LABEL: memset_1:
-; RV64-BOTH:       # %bb.0: # %loadstoreloop.preheader
+; RV64-BOTH:       # %bb.0:
 ; RV64-BOTH-NEXT:    addi a3, a0, 16
-; RV64-BOTH-NEXT:  .LBB0_1: # %loadstoreloop
+; RV64-BOTH-NEXT:  .LBB0_1: # %memset.pattern-expansion-main-body
 ; RV64-BOTH-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV64-BOTH-NEXT:    sd a1, 0(a0)
 ; RV64-BOTH-NEXT:    sd a2, 8(a0)
 ; RV64-BOTH-NEXT:    addi a0, a0, 16
 ; RV64-BOTH-NEXT:    bne a0, a3, .LBB0_1
-; RV64-BOTH-NEXT:  # %bb.2: # %split
+; RV64-BOTH-NEXT:  # %bb.2: # %memset.pattern-post-expansion
 ; RV64-BOTH-NEXT:    ret
   tail call void @llvm.experimental.memset.pattern(ptr align 8 %a, i128 %value, i64 1, i1 0)
   ret void
@@ -53,7 +53,7 @@ define void @memset_1(ptr %a, i128 %value) nounwind {
 
 define void @memset_1_noalign(ptr %a, i128 %value) nounwind {
 ; RV32-LABEL: memset_1_noalign:
-; RV32:       # %bb.0: # %loadstoreloop.preheader
+; RV32:       # %bb.0:
 ; RV32-NEXT:    addi sp, sp, -32
 ; RV32-NEXT:    sw s0, 28(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s1, 24(sp) # 4-byte Folded Spill
@@ -79,7 +79,7 @@ define void @memset_1_noalign(ptr %a, i128 %value) nounwind {
 ; RV32-NEXT:    srli s1, a1, 24
 ; RV32-NEXT:    srli s2, a1, 16
 ; RV32-NEXT:    srli s3, a1, 8
-; RV32-NEXT:  .LBB1_1: # %loadstoreloop
+; RV32-NEXT:  .LBB1_1: # %memset.pattern-expansion-main-body
 ; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV32-NEXT:    slli s4, a2, 4
 ; RV32-NEXT:    addi a2, a2, 1
@@ -104,7 +104,7 @@ define void @memset_1_noalign(ptr %a, i128 %value) nounwind {
 ; RV32-NEXT:    sb s2, 14(s4)
 ; RV32-NEXT:    sb s1, 15(s4)
 ; RV32-NEXT:    beqz s5, .LBB1_1
-; RV32-NEXT:  # %bb.2: # %split
+; RV32-NEXT:  # %bb.2: # %memset.pattern-post-expansion
 ; RV32-NEXT:    lw s0, 28(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s1, 24(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s2, 20(sp) # 4-byte Folded Reload
@@ -115,7 +115,7 @@ define void @memset_1_noalign(ptr %a, i128 %value) nounwind {
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: memset_1_noalign:
-; RV64:       # %bb.0: # %loadstoreloop.preheader
+; RV64:       # %bb.0:
 ; RV64-NEXT:    addi sp, sp, -32
 ; RV64-NEXT:    sd s0, 24(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    sd s1, 16(sp) # 8-byte Folded Spill
@@ -135,7 +135,7 @@ define void @memset_1_noalign(ptr %a, i128 %value) nounwind {
 ; RV64-NEXT:    srli s0, a2, 24
 ; RV64-NEXT:    srli s1, a2, 16
 ; RV64-NEXT:    srli s2, a2, 8
-; RV64-NEXT:  .LBB1_1: # %loadstoreloop
+; RV64-NEXT:  .LBB1_1: # %memset.pattern-expansion-main-body
 ; RV64-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV64-NEXT:    sb a7, 4(a0)
 ; RV64-NEXT:    sb a6, 5(a0)
@@ -155,7 +155,7 @@ define void @memset_1_noalign(ptr %a, i128 %value) nounwind {
 ; RV64-NEXT:    sb s0, 11(a0)
 ; RV64-NEXT:    addi a0, a0, 16
 ; RV64-NEXT:    bne a0, a3, .LBB1_1
-; RV64-NEXT:  # %bb.2: # %split
+; RV64-NEXT:  # %bb.2: # %memset.pattern-post-expansion
 ; RV64-NEXT:    ld s0, 24(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    ld s1, 16(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    ld s2, 8(sp) # 8-byte Folded Reload
@@ -163,14 +163,14 @@ define void @memset_1_noalign(ptr %a, i128 %value) nounwind {
 ; RV64-NEXT:    ret
 ;
 ; RV32-FAST-LABEL: memset_1_noalign:
-; RV32-FAST:       # %bb.0: # %loadstoreloop.preheader
+; RV32-FAST:       # %bb.0:
 ; RV32-FAST-NEXT:    li a2, 0
 ; RV32-FAST-NEXT:    lw a3, 0(a1)
 ; RV32-FAST-NEXT:    lw a4, 4(a1)
 ; RV32-FAST-NEXT:    lw a5, 8(a1)
 ; RV32-FAST-NEXT:    lw a1, 12(a1)
 ; RV32-FAST-NEXT:    li a6, 0
-; RV32-FAST-NEXT:  .LBB1_1: # %loadstoreloop
+; RV32-FAST-NEXT:  .LBB1_1: # %memset.pattern-expansion-main-body
 ; RV32-FAST-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV32-FAST-NEXT:    slli a7, a2, 4
 ; RV32-FAST-NEXT:    addi a2, a2, 1
@@ -183,19 +183,19 @@ define void @memset_1_noalign(ptr %a, i128 %value) nounwind {
 ; RV32-FAST-NEXT:    sw a5, 8(a7)
 ; RV32-FAST-NEXT:    sw a1, 12(a7)
 ; RV32-FAST-NEXT:    beqz t0, .LBB1_1
-; RV32-FAST-NEXT:  # %bb.2: # %split
+; RV32-FAST-NEXT:  # %bb.2: # %memset.pattern-post-expansion
 ; RV32-FAST-NEXT:    ret
 ;
 ; RV64-FAST-LABEL: memset_1_noalign:
-; RV64-FAST:       # %bb.0: # %loadstoreloop.preheader
+; RV64-FAST:       # %bb.0:
 ; RV64-FAST-NEXT:    addi a3, a0, 16
-; RV64-FAST-NEXT:  .LBB1_1: # %loadstoreloop
+; RV64-FAST-NEXT:  .LBB1_1: # %memset.pattern-expansion-main-body
 ; RV64-FAST-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV64-FAST-NEXT:    sd a1, 0(a0)
 ; RV64-FAST-NEXT:    sd a2, 8(a0)
 ; RV64-FAST-NEXT:    addi a0, a0, 16
 ; RV64-FAST-NEXT:    bne a0, a3, .LBB1_1
-; RV64-FAST-NEXT:  # %bb.2: # %split
+; RV64-FAST-NEXT:  # %bb.2: # %memset.pattern-post-expansion
 ; RV64-FAST-NEXT:    ret
   tail call void @llvm.experimental.memset.pattern(ptr %a, i128 %value, i64 1, i1 0)
   ret void
@@ -203,14 +203,14 @@ define void @memset_1_noalign(ptr %a, i128 %value) nounwind {
 
 define void @memset_4(ptr %a, i128 %value) nounwind {
 ; RV32-BOTH-LABEL: memset_4:
-; RV32-BOTH:       # %bb.0: # %loadstoreloop.preheader
+; RV32-BOTH:       # %bb.0:
 ; RV32-BOTH-NEXT:    li a2, 0
 ; RV32-BOTH-NEXT:    lw a3, 0(a1)
 ; RV32-BOTH-NEXT:    lw a4, 4(a1)
 ; RV32-BOTH-NEXT:    lw a5, 8(a1)
 ; RV32-BOTH-NEXT:    lw a1, 12(a1)
 ; RV32-BOTH-NEXT:    li a6, 0
-; RV32-BOTH-NEXT:  .LBB2_1: # %loadstoreloop
+; RV32-BOTH-NEXT:  .LBB2_1: # %memset.pattern-expansion-main-body
 ; RV32-BOTH-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV32-BOTH-NEXT:    slli a7, a2, 4
 ; RV32-BOTH-NEXT:    addi a2, a2, 1
@@ -225,19 +225,19 @@ define void @memset_4(ptr %a, i128 %value) nounwind {
 ; RV32-BOTH-NEXT:    sw a5, 8(a7)
 ; RV32-BOTH-NEXT:    sw a1, 12(a7)
 ; RV32-BOTH-NEXT:    bnez t0, .LBB2_1
-; RV32-BOTH-NEXT:  # %bb.2: # %split
+; RV32-BOTH-NEXT:  # %bb.2: # %memset.pattern-post-expansion
 ; RV32-BOTH-NEXT:    ret
 ;
 ; RV64-BOTH-LABEL: memset_4:
-; RV64-BOTH:       # %bb.0: # %loadstoreloop.preheader
+; RV64-BOTH:       # %bb.0:
 ; RV64-BOTH-NEXT:    addi a3, a0, 64
-; RV64-BOTH-NEXT:  .LBB2_1: # %loadstoreloop
+; RV64-BOTH-NEXT:  .LBB2_1: # %memset.pattern-expansion-main-body
 ; RV64-BOTH-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV64-BOTH-NEXT:    sd a1, 0(a0)
 ; RV64-BOTH-NEXT:    sd a2, 8(a0)
 ; RV64-BOTH-NEXT:    addi a0, a0, 16
 ; RV64-BOTH-NEXT:    bne a0, a3, .LBB2_1
-; RV64-BOTH-NEXT:  # %bb.2: # %split
+; RV64-BOTH-NEXT:  # %bb.2: # %memset.pattern-post-expansion
 ; RV64-BOTH-NEXT:    ret
   tail call void @llvm.experimental.memset.pattern(ptr align 8 %a, i128 %value, i64 4, i1 0)
   ret void
@@ -248,7 +248,7 @@ define void @memset_x(ptr %a, i128 %value, i64 %x) nounwind {
 ; RV32-BOTH:       # %bb.0:
 ; RV32-BOTH-NEXT:    or a4, a2, a3
 ; RV32-BOTH-NEXT:    beqz a4, .LBB3_5
-; RV32-BOTH-NEXT:  # %bb.1: # %loadstoreloop.preheader
+; RV32-BOTH-NEXT:  # %bb.1: # %memset.pattern-expansion-main-body.preheader
 ; RV32-BOTH-NEXT:    li a4, 0
 ; RV32-BOTH-NEXT:    lw a5, 0(a1)
 ; RV32-BOTH-NEXT:    lw a6, 4(a1)
@@ -256,11 +256,11 @@ define void @memset_x(ptr %a, i128 %value, i64 %x) nounwind {
 ; RV32-BOTH-NEXT:    lw a1, 12(a1)
 ; RV32-BOTH-NEXT:    li t0, 0
 ; RV32-BOTH-NEXT:    j .LBB3_3
-; RV32-BOTH-NEXT:  .LBB3_2: # %loadstoreloop
+; RV32-BOTH-NEXT:  .LBB3_2: # %memset.pattern-expansion-main-body
 ; RV32-BOTH-NEXT:    # in Loop: Header=BB3_3 Depth=1
 ; RV32-BOTH-NEXT:    sltu t1, t0, a3
 ; RV32-BOTH-NEXT:    beqz t1, .LBB3_5
-; RV32-BOTH-NEXT:  .LBB3_3: # %loadstoreloop
+; RV32-BOTH-NEXT:  .LBB3_3: # %memset.pattern-expansion-main-body
 ; RV32-BOTH-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV32-BOTH-NEXT:    slli t1, a4, 4
 ; RV32-BOTH-NEXT:    addi a4, a4, 1
@@ -275,22 +275,22 @@ define void @memset_x(ptr %a, i128 %value, i64 %x) nounwind {
 ; RV32-BOTH-NEXT:  # %bb.4: # in Loop: Header=BB3_3 Depth=1
 ; RV32-BOTH-NEXT:    sltu t1, a4, a2
 ; RV32-BOTH-NEXT:    bnez t1, .LBB3_3
-; RV32-BOTH-NEXT:  .LBB3_5: # %split
+; RV32-BOTH-NEXT:  .LBB3_5: # %memset.pattern-post-expansion
 ; RV32-BOTH-NEXT:    ret
 ;
 ; RV64-BOTH-LABEL: memset_x:
 ; RV64-BOTH:       # %bb.0:
 ; RV64-BOTH-NEXT:    beqz a3, .LBB3_3
-; RV64-BOTH-NEXT:  # %bb.1: # %loadstoreloop.preheader
+; RV64-BOTH-NEXT:  # %bb.1: # %memset.pattern-expansion-main-body.preheader
 ; RV64-BOTH-NEXT:    li a4, 0
-; RV64-BOTH-NEXT:  .LBB3_2: # %loadstoreloop
+; RV64-BOTH-NEXT:  .LBB3_2: # %memset.pattern-expansion-main-body
 ; RV64-BOTH-NEXT:    # =>This Inner Loop Header: Depth=1
 ; RV64-BOTH-NEXT:    sd a1, 0(a0)
 ; RV64-BOTH-NEXT:    sd a2, 8(a0)
 ; RV64-BOTH-NEXT:    addi a4, a4, 1
 ; RV64-BOTH-NEXT:    addi a0, a0, 16
 ; RV64-BOTH-NEXT:    bltu a4, a3, .LBB3_2
-; RV64-BOTH-NEXT:  .LBB3_3: # %split
+; RV64-BOTH-NEXT:  .LBB3_3: # %memset.pattern-post-expansion
 ; RV64-BOTH-NEXT:    ret
   tail call void @llvm.experimental.memset.pattern(ptr align 8 %a, i128 %value, i64 %x, i1 0)
   ret void
diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/AMDGPU/memset-pattern.ll b/llvm/test/Transforms/PreISelIntrinsicLowering/AMDGPU/memset-pattern.ll
new file mode 100644
index 0000000000000..e79e330190722
--- /dev/null
+++ b/llvm/test/Transforms/PreISelIntrinsicLowering/AMDGPU/memset-pattern.ll
@@ -0,0 +1,273 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -p=pre-isel-intrinsic-lowering -S < %s | FileCheck -check-prefixes=CHECK,DEFAULT %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -p=pre-isel-intrinsic-lowering -S -amdgpu-memcpy-loop-unroll=2 %s -o - | FileCheck -check-prefixes=CHECK,UNROLL2 %s
+
+define void @memset_pattern_i128_len0_dynvalue(ptr align 16 %a, i128 %value) {
+; CHECK-LABEL: @memset_pattern_i128_len0_dynvalue(
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.experimental.memset.pattern(ptr align 16 %a, i128 %value, i64 0, i1 false)
+  ret void
+}
+
+define void @memset_pattern_i128_len1_dynvalue(ptr align 16 %a, i128 %value) {
+; CHECK-LABEL: @memset_pattern_i128_len1_dynvalue(
+; CHECK-NEXT:    br label [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY:%.*]]
+; CHECK:       memset.pattern-expansion-residual-body:
+; CHECK-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP2:%.*]], [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i128, ptr [[A:%.*]], i64 [[RESIDUAL_LOOP_INDEX]]
+; CHECK-NEXT:    store i128 [[VALUE:%.*]], ptr [[TMP1]], align 16
+; CHECK-NEXT:    [[TMP2]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 1
+; CHECK-NEXT:    br i1 [[TMP3]], label [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY]], label [[MEMSET_PATTERN_POST_EXPANSION:%.*]]
+; CHECK:       memset.pattern-post-expansion:
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.experimental.memset.pattern(ptr align 16 %a, i128 %value, i64 1, i1 false)
+  ret void
+}
+
+define void @memset_pattern_i128_len1(ptr align 16 %a) {
+; CHECK-LABEL: @memset_pattern_i128_len1(
+; CHECK-NEXT:    br label [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY:%.*]]
+; CHECK:       memset.pattern-expansion-residual-body:
+; CHECK-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP2:%.*]], [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i128, ptr [[A:%.*]], i64 [[RESIDUAL_LOOP_INDEX]]
+; CHECK-NEXT:    store i128 -113427455635030943652277463699152839203, ptr [[TMP1]], align 16
+; CHECK-NEXT:    [[TMP2]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 1
+; CHECK-NEXT:    br i1 [[TMP3]], label [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY]], label [[MEMSET_PATTERN_POST_EXPANSION:%.*]]
+; CHECK:       memset.pattern-post-expansion:
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.experimental.memset.pattern(ptr align 16 %a, i128 u0xaaaaaaaabbbbbbbbccccccccdddddddd, i64 1, i1 false)
+  ret void
+}
+
+define void @memset_pattern_i128_constlen_mainloop_and_residual_taken(ptr align 16 %a) {
+; DEFAULT-LABEL: @memset_pattern_i128_constlen_mainloop_and_residual_taken(
+; DEFAULT-NEXT:    br label [[MEMSET_PATTERN_EXPANSION_MAIN_BODY:%.*]]
+; DEFAULT:       memset.pattern-expansion-main-body:
+; DEFAULT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP2:%.*]], [[MEMSET_PATTERN_EXPANSION_MAIN_BODY]] ]
+; DEFAULT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <64 x i32>, ptr [[A:%.*]], i64 [[LOOP_INDEX]]
+; DEFAULT-NEXT:    store <64 x i32> bitcast (<16 x i128> splat (i128 -113427455635030943652277463699152839203) to <64 x i32>), ptr [[TMP1]], align 16
+; DEFAULT-NEXT:    [[TMP2]] = add i64 [[LOOP_INDEX]], 16
+; DEFAULT-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 16
+; DEFAULT-NEXT:    br i1 [[TMP3]], label [[MEMSET_PATTERN_EXPANSION_MAIN_BODY]], label [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY:%.*]]
+; DEFAULT:       memset.pattern-expansion-residual-body:
+; DEFAULT-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[MEMSET_PATTERN_EXPANSION_MAIN_BODY]] ], [ [[TMP6:%.*]], [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY]] ]
+; DEFAULT-NEXT:    [[TMP4:%.*]] = add i64 16, [[RESIDUAL_LOOP_INDEX]]
+; DEFAULT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i128, ptr [[A]], i64 [[TMP4]]
+; DEFAULT-NEXT:    store i128 -113427455635030943652277463699152839203, ptr [[TMP5]], align 16
+; DEFAULT-NEXT:    [[TMP6]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
+; DEFAULT-NEXT:    [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 3
+; DEFAULT-NEXT:    br i1 [[TMP7]], label [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY]], label [[MEMSET_PATTERN_POST_EXPANSION:%.*]]
+; DEFAULT:       memset.pattern-post-expansion:
+; DEFAULT-NEXT:    ret void
+;
+; UNROLL2-LABEL: @memset_pattern_i128_constlen_mainloop_and_residual_taken(
+; UNROLL2-NEXT:    br label [[MEMSET_PATTERN_EXPANSION_MAIN_BODY:%.*]]
+; UNROLL2:       memset.pattern-expansion-main-body:
+; UNROLL2-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP2:%.*]], [[MEMSET_PATTERN_EXPANSION_MAIN_BODY]] ]
+; UNROLL2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <8 x i32>, ptr [[A:%.*]], i64 [[LOOP_INDEX]]
+; UNROLL2-NEXT:    store <8 x i32> bitcast (<2 x i128> splat (i128 -113427455635030943652277463699152839203) to <8 x i32>), ptr [[TMP1]], align 16
+; UNROLL2-NEXT:    [[TMP2]] = add i64 [[LOOP_INDEX]], 2
+; UNROLL2-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 18
+; UNROLL2-NEXT:    br i1 [[TMP3]], label [[MEMSET_PATTERN_EXPANSION_MAIN_BODY]], label [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY:%.*]]
+; UNROLL2:       memset.pattern-expansion-residual-body:
+; UNROLL2-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[MEMSET_PATTERN_EXPANSION_MAIN_BODY]] ], [ [[TMP6:%.*]], [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY]] ]
+; UNROLL2-NEXT:    [[TMP4:%.*]] = add i64 18, [[RESIDUAL_LOOP_INDEX]]
+; UNROLL2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i128, ptr [[A]], i64 [[TMP4]]
+; UNROLL2-NEXT:    store i128 -113427455635030943652277463699152839203, ptr [[TMP5]], align 16
+; UNROLL2-NEXT:    [[TMP6]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
+; UNROLL2-NEXT:    [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 1
+; UNROLL2-NEXT:    br i1 [[TMP7]], label [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY]], label [[MEMSET_PATTERN_POST_EXPANSION:%.*]]
+; UNROLL2:       memset.pattern-post-expansion:
+; UNROLL2-NEXT:    ret void
+;
+  call void @llvm.experimental.memset.pattern(ptr align 16 %a, i128 u0xaaaaaaaabbbbbbbbccccccccdddddddd, i64 19, i1 false)
+  ret void
+}
+
+define void @memset_pattern_i128_len1_nz_as(ptr addrspace(3) align 16 %a) {
+; CHECK-LABEL: @memset_pattern_i128_len1_nz_as(
+; CHECK-NEXT:    br label [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY:%.*]]
+; CHECK:       memset.pattern-expansion-residual-body:
+; CHECK-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP2:%.*]], [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i128, ptr addrspace(3) [[A:%.*]], i64 [[RESIDUAL_LOOP_INDEX]]
+; CHECK-NEXT:    store i128 -113427455635030943652277463699152839203, ptr addrspace(3) [[TMP1]], align 16
+; CHECK-NEXT:    [[TMP2]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 1
+; CHECK-NEXT:    br i1 [[TMP3]], label [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY]], label [[MEMSET_PATTERN_POST_EXPANSION:%.*]]
+; CHECK:       memset.pattern-post-expansion:
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.experimental.memset.pattern(ptr addrspace(3) align 16 %a, i128 u0xaaaaaaaabbbbbbbbccccccccdddddddd, i64 1, i1 false)
+  ret void
+}
+
+define void @memset_pattern_i128_len1_no_align(ptr %a) {
+; CHECK-LABEL: @memset_pattern_i128_len1_no_align(
+; CHECK-NEXT:    br label [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY:%.*]]
+; CHECK:       memset.pattern-expansion-residual-body:
+; CHECK-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP2:%.*]], [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i128, ptr [[A:%.*]], i64 [[RESIDUAL_LOOP_INDEX]]
+; CHECK-NEXT:    store i128 -113427455635030943652277463699152839203, ptr [[TMP1]], align 1
+; CHECK-NEXT:    [[TMP2]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 1
+; CHECK-NEXT:    br i1 [[TMP3]], label [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY]], label [[MEMSET_PATTERN_POST_EXPANSION:%.*]]
+; CHECK:       memset.pattern-post-expansion:
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.experimental.memset.pattern(ptr %a, i128 u0xaaaaaaaabbbbbbbbccccccccdddddddd, i64 1, i1 false)
+  ret void
+}
+
+define void @memset_pattern_i128_len16(ptr align 16 %a) {
+; DEFAULT-LABEL: @memset_pattern_i128_len16(
+; DEFAULT-NEXT:    br label [[MEMSET_PATTERN_EXPANSION_MAIN_BODY:%.*]]
+; DEFAULT:       memset.pattern-expansion-main-body:
+; DEFAULT-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP2:%.*]], [[MEMSET_PATTERN_EXPANSION_MAIN_BODY]] ]
+; DEFAULT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <64 x i32>, ptr [[A:%.*]], i64 [[LOOP_INDEX]]
+; DEFAULT-NEXT:    store <64 x i32> bitcast (<16 x i128> splat (i128 -113427455635030943652277463699152839203) to <64 x i32>), ptr [[TMP1]], align 16
+; DEFAULT-NEXT:    [[TMP2]] = add i64 [[LOOP_INDEX]], 16
+; DEFAULT-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 16
+; DEFAULT-NEXT:    br i1 [[TMP3]], label [[MEMSET_PATTERN_EXPANSION_MAIN_BODY]], label [[MEMSET_PATTERN_POST_EXPANSION:%.*]]
+; DEFAULT:       memset.pattern-post-expansion:
+; DEFAULT-NEXT:    ret void
+;
+; UNROLL2-LABEL: @memset_pattern_i128_len16(
+; UNROLL2-NEXT:    br label [[MEMSET_PATTERN_EXPANSION_MAIN_BODY:%.*]]
+; UNROLL2:       memset.pattern-expansion-main-body:
+; UNROLL2-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP2:%.*]], [[MEMSET_PATTERN_EXPANSION_MAIN_BODY]] ]
+; UNROLL2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <8 x i32>, ptr [[A:%.*]], i64 [[LOOP_INDEX]]
+; UNROLL2-NEXT:    store <8 x i32> bitcast (<2 x i128> splat (i128 -113427455635030943652277463699152839203) to <8 x i32>), ptr [[TMP1]], align 16
+; UNROLL2-NEXT:    [[TMP2]] = add i64 [[LOOP_INDEX]], 2
+; UNROLL2-NEXT:    [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 16
+; UNROLL2-NEXT:    br i1 [[TMP3]], label [[MEMSET_PATTERN_EXPANSION_MAIN_BODY]], label [[MEMSET_PATTERN_POST_EXPANSION:%.*]]
+; UNROLL2:       memset.pattern-post-expansion:
+; UNROLL2-NEXT:    ret void
+;
+  call void @llvm.experimental.memset.pattern(ptr align 16 %a, i128 u0xaaaaaaaabbbbbbbbccccccccdddddddd, i64 16, i1 false)
+  ret void
+}
+
+define void @memset_pattern_i128_dynlen(ptr align 16 %a, i64 %len) {
+; CHECK-LABEL: @memset_pattern_i128_dynlen(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne i64 [[LEN:%.*]], 0
+; CHECK-NEXT:    br i1 [[TMP1]], label [[MEMSET_PATTERN_EXPANSION_MAIN_BODY:%.*]], label [[MEMSET_PATTERN_POST_EXPANSION:%.*]]
+; CHECK:       memset.pattern-expansion-main-body:
+; CHECK-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[MEMSET_PATTERN_EXPANSION_MAIN_BODY]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i128, ptr [[A:%.*]], i64 [[LOOP_INDEX]]
+; CHECK-NEXT:    store i128 -113427455635030943652277463699152839203, ptr [[TMP2]], align 1
+; CHECK-NEXT:    [[TMP3]] = add i64 [[LOOP_INDEX]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP3]], [[LEN]]
+; CHECK-NEXT:    br i1 [[TMP4]], label [[MEMSET_PATTERN_EXPANSION_MAIN_BODY]], label [[MEMSET_PATTERN_POST_EXPANSION]]
+; CHECK:       memset.pattern-post-expansion:
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.experimental.memset.pattern(ptr %a, i128 u0xaaaaaaaabbbbbbbbccccccccdddddddd, i64 %len, i1 false)
+  ret void
+}
+
+define void @memset_pattern_i128_dynlen_nz_as(ptr addrspace(3) align 16 %a, i64 %len) {
+; CHECK-LABEL: @memset_pattern_i128_dynlen_nz_as(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne i64 [[LEN:%.*]], 0
+; CHECK-NEXT:    br i1 [[TMP1]], label [[MEMSET_PATTERN_EXPANSION_MAIN_BODY:%.*]], label [[MEMSET_PATTERN_POST_EXPANSION:%.*]]
+; CHECK:       memset.pattern-expansion-main-body:
+; CHECK-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[MEMSET_PATTERN_EXPANSION_MAIN_BODY]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i128, ptr addrspace(3) [[A:%.*]], i64 [[LOOP_INDEX]]
+; CHECK-NEXT:    store i128 -113427455635030943652277463699152839203, ptr addrspace(3) [[TMP2]], align 16
+; CHECK-NEXT:    [[TMP3]] = add i64 [[LOOP_INDEX]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP3]], [[LEN]]
+; CHECK-NEXT:    br i1 [[TMP4]], label [[MEMSET_PATTERN_EXPANSION_MAIN_BODY]], label [[MEMSET_PATTERN_POST_EXPANSION]]
+; CHECK:       memset.pattern-post-expansion:
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.experimental.memset.pattern(ptr addrspace(3) align 16 %a, i128 u0xaaaaaaaabbbbbbbbccccccccdddddddd, i64 %len, i1 false)
+  ret void
+}
+
+define void @memset_pattern_i32_dynlen(ptr align 16 %a, i64 %len) {
+; CHECK-LABEL: @memset_pattern_i32_dynlen(
+; CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[LEN:%.*]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[LEN]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[TMP3]], label [[MEMSET_PATTERN_EXPANSION_MAIN_BODY:%.*]], label [[MEMSET_PATTERN_EXPANSION_RESIDUAL_COND:%.*]]
+; CHECK:       memset.pattern-expansion-main-body:
+; CHECK-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[MEMSET_PATTERN_EXPANSION_MAIN_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[A:%.*]], i64 [[LOOP_INDEX]]
+; CHECK-NEXT:    store <4 x i32> splat (i32 -1430532899), ptr [[TMP4]], align 16
+; CHECK-NEXT:    [[TMP5]] = add i64 [[LOOP_INDEX]], 4
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP5]], [[TMP2]]
+; CHECK-NEXT:    br i1 [[TMP6]], label [[MEMSET_PATTERN_EXPANSION_MAIN_BODY]], label [[MEMSET_PATTERN_EXPANSION_RESIDUAL_COND]]
+; CHECK:       memset.pattern-expansion-residual-cond:
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY:%.*]], label [[MEMSET_PATTERN_POST_EXPANSION:%.*]]
+; CHECK:       memset.pattern-expansion-residual-body:
+; CHECK-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[MEMSET_PATTERN_EXPANSION_RESIDUAL_COND]] ], [ [[TMP10:%.*]], [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY]] ]
+; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[TMP2]], [[RESIDUAL_LOOP_INDEX]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
+; CHECK-NEXT:    store i32 -1430532899, ptr [[TMP9]], align 4
+; CHECK-NEXT:    [[TMP10]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ult i64 [[TMP10]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[TMP11]], label [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY]], label [[MEMSET_PATTERN_POST_EXPANSION]]
+; CHECK:       memset.pattern-post-expansion:
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.experimental.memset.pattern(ptr align 16 %a, i32 u0xaabbccdd, i64 %len, i1 false)
+  ret void
+}
+
+define void @memset_pattern_i32_dynval_dynlen(ptr align 16 %a, i32 %val, i64 %len) {
+; CHECK-LABEL: @memset_pattern_i32_dynval_dynlen(
+; CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[LEN:%.*]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[LEN]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i64 [[TMP2]], 0
+; CHECK-NEXT:    [[SETVALUE_SPLAT_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[VAL:%.*]], i64 0
+; CHECK-NEXT:    [[SETVALUE_SPLAT_SPLAT:%.*]] = shufflevector <4 x i32> [[SETVALUE_SPLAT_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br i1 [[TMP3]], label [[MEMSET_PATTERN_EXPANSION_MAIN_BODY:%.*]], label [[MEMSET_PATTERN_EXPANSION_RESIDUAL_COND:%.*]]
+; CHECK:       memset.pattern-expansion-main-body:
+; CHECK-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[MEMSET_PATTERN_EXPANSION_MAIN_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[A:%.*]], i64 [[LOOP_INDEX]]
+; CHECK-NEXT:    store <4 x i32> [[SETVALUE_SPLAT_SPLAT]], ptr [[TMP4]], align 16
+; CHECK-NEXT:    [[TMP5]] = add i64 [[LOOP_INDEX]], 4
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ult i64 [[TMP5]], [[TMP2]]
+; CHECK-NEXT:    br i1 [[TMP6]], label [[MEMSET_PATTERN_EXPANSION_MAIN_BODY]], label [[MEMSET_PATTERN_EXPANSION_RESIDUAL_COND]]
+; CHECK:       memset.pattern-expansion-residual-cond:
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY:%.*]], label [[MEMSET_PATTERN_POST_EXPANSION:%.*]]
+; CHECK:       memset.pattern-expansion-residual-body:
+; CHECK-NEXT:    [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[MEMSET_PATTERN_EXPANSION_RESIDUAL_COND]] ], [ [[TMP10:%.*]], [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY]] ]
+; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[TMP2]], [[RESIDUAL_LOOP_INDEX]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
+; CHECK-NEXT:    store i32 [[VAL]], ptr [[TMP9]], align 4
+; CHECK-NEXT:    [[TMP10]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ult i64 [[TMP10]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[TMP11]], label [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY]], label [[MEMSET_PATTERN_POST_EXPANSION]]
+; CHECK:       memset.pattern-post-expansion:
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.experimental.memset.pattern(ptr align 16 %a, i32 %val, i64 %len, i1 false)
+  ret void
+}
+
+; For i96, the store size and the alloc size differ on AMDGPU, so this case is
+; not optimized.
+define void @memset_pattern_i96_dynval_dynlen(ptr align 16 %a, i96 %val, i64 %len) {
+; CHECK-LABEL: @memset_pattern_i96_dynval_dynlen(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne i64 [[LEN:%.*]], 0
+; CHECK-NEXT:    br i1 [[TMP1]], label [[MEMSET_PATTERN_EXPANSION_MAIN_BODY:%.*]], label [[MEMSET_PATTERN_POST_EXPANSION:%.*]]
+; CHECK:       memset.pattern-expansion-main-body:
+; CHECK-NEXT:    [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[MEMSET_PATTERN_EXPANSION_MAIN_BODY]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i96, ptr [[A:%.*]], i64 [[LOOP_INDEX]]
+; CHECK-NEXT:    store i96 [[VAL:%.*]], ptr [[TMP2]], align 16
+; CHECK-NEXT:    [[TMP3]] = add i64 [[LOOP_INDEX]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP3]], [[LEN]]
+; CHECK-NEXT:    br i1 [[TMP4]], label [[MEMSET_PATTERN_EXPANSION_MAIN_BODY]], label [[MEMSET_PATTERN_POST_EXPANSION]]
+; CHECK:       memset.pattern-post-expansion:
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.experimental.memset.pattern(ptr align 16 %a, i96 %val, i64 %len, i1 false)
+  ret void
+}
diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/PowerPC/memset-pattern.ll b/llvm/test/Transforms/PreISelIntrinsicLowering/PowerPC/memset-pattern.ll
index 1f77c4a605106..e9a71769526d9 100644
--- a/llvm/test/Transforms/PreISelIntrinsicLowering/PowerPC/memset-pattern.ll
+++ b/llvm/test/Transforms/PreISelIntrinsicLowering/PowerPC/memset-pattern.ll
@@ -7,8 +7,8 @@
 define void @memset.pattern(ptr %a, i128 %value, i64 %x) nounwind {
 ; CHECK-LABEL: define void @memset.pattern(
 ; CHECK-SAME: ptr [[A:%.*]], i128 [[VALUE:%.*]], i64 [[X:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 0, [[X]]
-; CHECK-NEXT:    br i1 [[TMP1]], label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne i64 [[X]], 0
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[LOADSTORELOOP:.*]], label %[[SPLIT:.*]]
 ; CHECK:       [[LOADSTORELOOP]]:
 ; CHECK-NEXT:    [[TMP3:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOADSTORELOOP]] ]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i128, ptr [[A]], i64 [[TMP3]]
diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/memset-pattern.ll b/llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/memset-pattern.ll
index d3ef9fe4cefbd..ebdd8962f4b05 100644
--- a/llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/memset-pattern.ll
+++ b/llvm/test/Transforms/PreISelIntrinsicLowering/RISCV/memset-pattern.ll
@@ -4,14 +4,14 @@
 define void @memset_pattern_i128_1(ptr %a, i128 %value) nounwind {
 ; CHECK-LABEL: define void @memset_pattern_i128_1(
 ; CHECK-SAME: ptr [[A:%.*]], i128 [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:    br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
+; CHECK-NEXT:    br label %[[LOADSTORELOOP:.*]]
 ; CHECK:       [[LOADSTORELOOP]]:
 ; CHECK-NEXT:    [[TMP2:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i128, ptr [[A]], i64 [[TMP2]]
 ; CHECK-NEXT:    store i128 [[VALUE]], ptr [[TMP1]], align 1
 ; CHECK-NEXT:    [[TMP3]] = add i64 [[TMP2]], 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 1
-; CHECK-NEXT:    br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT:.*]]
 ; CHECK:       [[SPLIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -22,14 +22,14 @@ define void @memset_pattern_i128_1(ptr %a, i128 %value) nounwind {
 define void @memset_pattern_i128_16(ptr %a, i128 %value) nounwind {
 ; CHECK-LABEL: define void @memset_pattern_i128_16(
 ; CHECK-SAME: ptr [[A:%.*]], i128 [[VALUE:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
+; CHECK-NEXT:    br label %[[LOADSTORELOOP:.*]]
 ; CHECK:       [[LOADSTORELOOP]]:
 ; CHECK-NEXT:    [[TMP2:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ]
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i128, ptr [[A]], i64 [[TMP2]]
 ; CHECK-NEXT:    store i128 [[VALUE]], ptr [[TMP1]], align 1
 ; CHECK-NEXT:    [[TMP3]] = add i64 [[TMP2]], 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 16
-; CHECK-NEXT:    br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT:.*]]
 ; CHECK:       [[SPLIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -40,8 +40,8 @@ define void @memset_pattern_i128_16(ptr %a, i128 %value) nounwind {
 define void @memset_pattern_i127_x(ptr %a, i127 %value, i64 %x) nounwind {
 ; CHECK-LABEL: define void @memset_pattern_i127_x(
 ; CHECK-SAME: ptr [[A:%.*]], i127 [[VALUE:%.*]], i64 [[X:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 0, [[X]]
-; CHECK-NEXT:    br i1 [[TMP1]], label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne i64 [[X]], 0
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[LOADSTORELOOP:.*]], label %[[SPLIT:.*]]
 ; CHECK:       [[LOADSTORELOOP]]:
 ; CHECK-NEXT:    [[TMP3:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOADSTORELOOP]] ]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i127, ptr [[A]], i64 [[TMP3]]
@@ -59,8 +59,8 @@ define void @memset_pattern_i127_x(ptr %a, i127 %value, i64 %x) nounwind {
 define void @memset_pattern_i128_x(ptr %a, i128 %value, i64 %x) nounwind {
 ; CHECK-LABEL: define void @memset_pattern_i128_x(
 ; CHECK-SAME: ptr [[A:%.*]], i128 [[VALUE:%.*]], i64 [[X:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 0, [[X]]
-; CHECK-NEXT:    br i1 [[TMP1]], label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne i64 [[X]], 0
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[LOADSTORELOOP:.*]], label %[[SPLIT:.*]]
 ; CHECK:       [[LOADSTORELOOP]]:
 ; CHECK-NEXT:    [[TMP2:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], %[[LOADSTORELOOP]] ]
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i128, ptr [[A]], i64 [[TMP2]]
@@ -78,8 +78,8 @@ define void @memset_pattern_i128_x(ptr %a, i128 %value, i64 %x) nounwind {
 define void @memset_pattern_i256_x(ptr %a, i256 %value, i64 %x) nounwind {
 ; CHECK-LABEL: define void @memset_pattern_i256_x(
 ; CHECK-SAME: ptr [[A:%.*]], i256 [[VALUE:%.*]], i64 [[X:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 0, [[X]]
-; CHECK-NEXT:    br i1 [[TMP1]], label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne i64 [[X]], 0
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[LOADSTORELOOP:.*]], label %[[SPLIT:.*]]
 ; CHECK:       [[LOADSTORELOOP]]:
 ; CHECK-NEXT:    [[TMP2:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], %[[LOADSTORELOOP]] ]
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i256, ptr [[A]], i64 [[TMP2]]
@@ -99,8 +99,8 @@ define void @memset_pattern_i256_x(ptr %a, i256 %value, i64 %x) nounwind {
 define void @memset_pattern_i15_x_alignment(ptr %a, i15 %value, i64 %x) nounwind {
 ; CHECK-LABEL: define void @memset_pattern_i15_x_alignment(
 ; CHECK-SAME: ptr [[A:%.*]], i15 [[VALUE:%.*]], i64 [[X:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 0, [[X]]
-; CHECK-NEXT:    br i1 [[TMP1]], label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne i64 [[X]], 0
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[LOADSTORELOOP:.*]], label %[[SPLIT:.*]]
 ; CHECK:       [[LOADSTORELOOP]]:
 ; CHECK-NEXT:    [[TMP3:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOADSTORELOOP]] ]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i15, ptr [[A]], i64 [[TMP3]]
@@ -109,8 +109,8 @@ define void @memset_pattern_i15_x_alignment(ptr %a, i15 %value, i64 %x) nounwind
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i64 [[TMP4]], [[X]]
 ; CHECK-NEXT:    br i1 [[TMP5]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
 ; CHECK:       [[SPLIT]]:
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 0, [[X]]
-; CHECK-NEXT:    br i1 [[TMP7]], label %[[SPLIT1:.*]], label %[[LOADSTORELOOP2:.*]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne i64 [[X]], 0
+; CHECK-NEXT:    br i1 [[TMP6]], label %[[LOADSTORELOOP2:.*]], label %[[SPLIT1:.*]]
 ; CHECK:       [[LOADSTORELOOP2]]:
 ; CHECK-NEXT:    [[TMP11:%.*]] = phi i64 [ 0, %[[SPLIT]] ], [ [[TMP9:%.*]], %[[LOADSTORELOOP2]] ]
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i15, ptr [[A]], i64 [[TMP11]]
diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/X86/memset-pattern.ll b/llvm/test/Transforms/PreISelIntrinsicLowering/X86/memset-pattern.ll
index 1a1fe20350885..ffedba6b6682d 100644
--- a/llvm/test/Transforms/PreISelIntrinsicLowering/X86/memset-pattern.ll
+++ b/llvm/test/Transforms/PreISelIntrinsicLowering/X86/memset-pattern.ll
@@ -12,14 +12,14 @@
 define void @memset_pattern_i128_1_dynvalue(ptr %a, i128 %value) nounwind {
 ; CHECK-LABEL: define void @memset_pattern_i128_1_dynvalue(
 ; CHECK-SAME: ptr [[A:%.*]], i128 [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:    br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
+; CHECK-NEXT:    br label %[[LOADSTORELOOP:.*]]
 ; CHECK:       [[LOADSTORELOOP]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i128, ptr [[A]], i64 [[TMP1]]
 ; CHECK-NEXT:    store i128 [[VALUE]], ptr [[TMP2]], align 1
 ; CHECK-NEXT:    [[TMP3]] = add i64 [[TMP1]], 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 1
-; CHECK-NEXT:    br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT:.*]]
 ; CHECK:       [[SPLIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -40,14 +40,14 @@ define void @memset_pattern_i128_1(ptr %a, i128 %value) nounwind {
 define void @memset_pattern_i128_1_nz_as(ptr addrspace(1) %a, i128 %value) nounwind {
 ; CHECK-LABEL: define void @memset_pattern_i128_1_nz_as(
 ; CHECK-SAME: ptr addrspace(1) [[A:%.*]], i128 [[VALUE:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
+; CHECK-NEXT:    br label %[[LOADSTORELOOP:.*]]
 ; CHECK:       [[LOADSTORELOOP]]:
 ; CHECK-NEXT:    [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i128, ptr addrspace(1) [[A]], i64 [[TMP1]]
 ; CHECK-NEXT:    store i128 -113427455635030943652277463699152839203, ptr addrspace(1) [[TMP2]], align 1
 ; CHECK-NEXT:    [[TMP3]] = add i64 [[TMP1]], 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 1
-; CHECK-NEXT:    br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT:.*]]
 ; CHECK:       [[SPLIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -89,8 +89,8 @@ define void @memset_pattern_i128_x(ptr %a, i64 %x) nounwind {
 define void @memset_pattern_i128_x_nonzero_as(ptr addrspace(10) %a, i64 %x) nounwind {
 ; CHECK-LABEL: define void @memset_pattern_i128_x_nonzero_as(
 ; CHECK-SAME: ptr addrspace(10) [[A:%.*]], i64 [[X:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 0, [[X]]
-; CHECK-NEXT:    br i1 [[TMP1]], label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne i64 [[X]], 0
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[LOADSTORELOOP:.*]], label %[[SPLIT:.*]]
 ; CHECK:       [[LOADSTORELOOP]]:
 ; CHECK-NEXT:    [[TMP2:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOADSTORELOOP]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i128, ptr addrspace(10) [[A]], i64 [[TMP2]]
@@ -195,8 +195,8 @@ define void @memset_pattern_i64_x_fromptr(ptr %a, i64 %x) nounwind {
 define void @memset_pattern_i64_x_fromnonconstptr(ptr %a, i64 %x, ptr %p) nounwind {
 ; CHECK-LABEL: define void @memset_pattern_i64_x_fromnonconstptr(
 ; CHECK-SAME: ptr [[A:%.*]], i64 [[X:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 0, [[X]]
-; CHECK-NEXT:    br i1 [[TMP2]], label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne i64 [[X]], 0
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[LOADSTORELOOP:.*]], label %[[SPLIT:.*]]
 ; CHECK:       [[LOADSTORELOOP]]:
 ; CHECK-NEXT:    [[TMP3:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[LOADSTORELOOP]] ]
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[TMP3]]

>From 3a8c16f7ae600268096c5121ae5545f3a355c3b5 Mon Sep 17 00:00:00 2001
From: Fabian Ritter <fabian.ritter at amd.com>
Date: Thu, 12 Mar 2026 08:10:49 -0400
Subject: [PATCH 2/3] Add AS7 tests

---
 llvm/test/CodeGen/AMDGPU/memset-pattern.ll | 1066 ++++++++++++++++++++
 1 file changed, 1066 insertions(+)

diff --git a/llvm/test/CodeGen/AMDGPU/memset-pattern.ll b/llvm/test/CodeGen/AMDGPU/memset-pattern.ll
index 3a5f730721016..cec1b86d4ad94 100644
--- a/llvm/test/CodeGen/AMDGPU/memset-pattern.ll
+++ b/llvm/test/CodeGen/AMDGPU/memset-pattern.ll
@@ -743,3 +743,1069 @@ define void @memset_pattern_i96_dynval_dynlen(ptr addrspace(1) align 16 %a, i96
   ret void
 }
 
+define void @memset_pattern_i64_as7_dynlen(ptr addrspace(7) align 16 %a, i64 %len) {
+; GFX942-SDAG-LABEL: memset_pattern_i64_as7_dynlen:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v12, v5
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v15, v6
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v14, -2, v12
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v10, 1, v12
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v11, 0
+; GFX942-SDAG-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[14:15]
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB12_5
+; GFX942-SDAG-NEXT:  ; %bb.1: ; %memset.pattern-expansion-main-body.preheader
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v6, 0xccccdddd
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v7, 0xaaaabbbb
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v8, v6
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v9, v7
+; GFX942-SDAG-NEXT:    s_mov_b64 s[10:11], 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, v4
+; GFX942-SDAG-NEXT:  .LBB12_2: ; %memset.pattern-expansion-main-body
+; GFX942-SDAG-NEXT:    ; =>This Loop Header: Depth=1
+; GFX942-SDAG-NEXT:    ; Child Loop BB12_3 Depth 2
+; GFX942-SDAG-NEXT:    s_mov_b64 s[12:13], exec
+; GFX942-SDAG-NEXT:  .LBB12_3: ; Parent Loop BB12_2 Depth=1
+; GFX942-SDAG-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX942-SDAG-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[6:9], v5, s[4:7], 0 offen
+; GFX942-SDAG-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB12_3
+; GFX942-SDAG-NEXT:  ; %bb.4: ; in Loop: Header=BB12_2 Depth=1
+; GFX942-SDAG-NEXT:    s_mov_b64 exec, s[12:13]
+; GFX942-SDAG-NEXT:    s_add_u32 s10, s10, 2
+; GFX942-SDAG-NEXT:    s_addc_u32 s11, s11, 0
+; GFX942-SDAG-NEXT:    v_cmp_ge_u64_e32 vcc, s[10:11], v[14:15]
+; GFX942-SDAG-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v5, 32, v5
+; GFX942-SDAG-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB12_2
+; GFX942-SDAG-NEXT:  .LBB12_5: ; %Flow3
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX942-SDAG-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB12_10
+; GFX942-SDAG-NEXT:  ; %bb.6: ; %memset.pattern-expansion-residual-body.preheader
+; GFX942-SDAG-NEXT:    v_lshlrev_b32_e32 v5, 3, v12
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v5, -16, v5
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v6, v4, v5
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, 0xccccdddd
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, 0xaaaabbbb
+; GFX942-SDAG-NEXT:    s_mov_b64 s[10:11], 0
+; GFX942-SDAG-NEXT:  .LBB12_7: ; %memset.pattern-expansion-residual-body
+; GFX942-SDAG-NEXT:    ; =>This Loop Header: Depth=1
+; GFX942-SDAG-NEXT:    ; Child Loop BB12_8 Depth 2
+; GFX942-SDAG-NEXT:    s_mov_b64 s[12:13], exec
+; GFX942-SDAG-NEXT:  .LBB12_8: ; Parent Loop BB12_7 Depth=1
+; GFX942-SDAG-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX942-SDAG-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX942-SDAG-NEXT:    buffer_store_dwordx2 v[4:5], v6, s[4:7], 0 offen
+; GFX942-SDAG-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB12_8
+; GFX942-SDAG-NEXT:  ; %bb.9: ; in Loop: Header=BB12_7 Depth=1
+; GFX942-SDAG-NEXT:    s_mov_b64 exec, s[12:13]
+; GFX942-SDAG-NEXT:    s_add_u32 s10, s10, 1
+; GFX942-SDAG-NEXT:    s_addc_u32 s11, s11, 0
+; GFX942-SDAG-NEXT:    v_cmp_ge_u64_e32 vcc, s[10:11], v[10:11]
+; GFX942-SDAG-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v6, 8, v6
+; GFX942-SDAG-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB12_7
+; GFX942-SDAG-NEXT:  .LBB12_10: ; %Flow1
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: memset_pattern_i64_as7_dynlen:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v12, v5
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v10, 1, v12
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v13, v6
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v14, vcc, v12, v10
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v11, 0
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subbrev_co_u32_e32 v15, vcc, 0, v13, vcc
+; GFX942-GISEL-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[14:15]
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB12_5
+; GFX942-GISEL-NEXT:  ; %bb.1: ; %memset.pattern-expansion-main-body.preheader
+; GFX942-GISEL-NEXT:    s_mov_b32 s4, 0xccccdddd
+; GFX942-GISEL-NEXT:    s_mov_b32 s5, 0xaaaabbbb
+; GFX942-GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[6:7]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[4:5]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[2:3]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v5, v4
+; GFX942-GISEL-NEXT:  .LBB12_2: ; %memset.pattern-expansion-main-body
+; GFX942-GISEL-NEXT:    ; =>This Loop Header: Depth=1
+; GFX942-GISEL-NEXT:    ; Child Loop BB12_3 Depth 2
+; GFX942-GISEL-NEXT:    s_mov_b64 s[10:11], exec
+; GFX942-GISEL-NEXT:  .LBB12_3: ; Parent Loop BB12_2 Depth=1
+; GFX942-GISEL-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX942-GISEL-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[6:9], v5, s[4:7], 0 offen
+; GFX942-GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB12_3
+; GFX942-GISEL-NEXT:  ; %bb.4: ; in Loop: Header=BB12_2 Depth=1
+; GFX942-GISEL-NEXT:    s_mov_b64 exec, s[10:11]
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v16, vcc, 2, v16
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v5, 32, v5
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v17, vcc, 0, v17, vcc
+; GFX942-GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[16:17], v[14:15]
+; GFX942-GISEL-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-GISEL-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB12_2
+; GFX942-GISEL-NEXT:  .LBB12_5: ; %Flow3
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v5, 1, v10
+; GFX942-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB12_10
+; GFX942-GISEL-NEXT:  ; %bb.6: ; %memset.pattern-expansion-residual-body.preheader
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[6:7], 1, v[12:13]
+; GFX942-GISEL-NEXT:    v_lshl_add_u32 v8, v6, 4, v4
+; GFX942-GISEL-NEXT:    s_mov_b64 s[8:9], 0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v4, 0xccccdddd
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v5, 0xaaaabbbb
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[8:9]
+; GFX942-GISEL-NEXT:  .LBB12_7: ; %memset.pattern-expansion-residual-body
+; GFX942-GISEL-NEXT:    ; =>This Loop Header: Depth=1
+; GFX942-GISEL-NEXT:    ; Child Loop BB12_8 Depth 2
+; GFX942-GISEL-NEXT:    s_mov_b64 s[10:11], exec
+; GFX942-GISEL-NEXT:  .LBB12_8: ; Parent Loop BB12_7 Depth=1
+; GFX942-GISEL-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX942-GISEL-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    buffer_store_dwordx2 v[4:5], v8, s[4:7], 0 offen
+; GFX942-GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB12_8
+; GFX942-GISEL-NEXT:  ; %bb.9: ; in Loop: Header=BB12_7 Depth=1
+; GFX942-GISEL-NEXT:    s_mov_b64 exec, s[10:11]
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v6, vcc, 1, v6
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v8, 8, v8
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
+; GFX942-GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[6:7], v[10:11]
+; GFX942-GISEL-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX942-GISEL-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB12_7
+; GFX942-GISEL-NEXT:  .LBB12_10: ; %Flow1
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.experimental.memset.pattern(ptr addrspace(7) %a, i64 u0xaaaabbbbccccdddd, i64 %len, i1 false)
+  ret void
+}
+
+define void @memset_pattern_i64_as7_dynlen_dynval(ptr addrspace(7) align 16 %a, i64 %len, i64 %val) {
+; GFX942-SDAG-LABEL: memset_pattern_i64_as7_dynlen_dynval:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v13, v8
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v8, v5
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v11, v6
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v10, -2, v8
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v12, v7
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v6, 1, v8
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v7, 0
+; GFX942-SDAG-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB13_5
+; GFX942-SDAG-NEXT:  ; %bb.1: ; %memset.pattern-expansion-main-body.preheader
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v14, v12
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v15, v13
+; GFX942-SDAG-NEXT:    s_mov_b64 s[10:11], 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, v4
+; GFX942-SDAG-NEXT:  .LBB13_2: ; %memset.pattern-expansion-main-body
+; GFX942-SDAG-NEXT:    ; =>This Loop Header: Depth=1
+; GFX942-SDAG-NEXT:    ; Child Loop BB13_3 Depth 2
+; GFX942-SDAG-NEXT:    s_mov_b64 s[12:13], exec
+; GFX942-SDAG-NEXT:  .LBB13_3: ; Parent Loop BB13_2 Depth=1
+; GFX942-SDAG-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX942-SDAG-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[12:15], v5, s[4:7], 0 offen
+; GFX942-SDAG-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB13_3
+; GFX942-SDAG-NEXT:  ; %bb.4: ; in Loop: Header=BB13_2 Depth=1
+; GFX942-SDAG-NEXT:    s_mov_b64 exec, s[12:13]
+; GFX942-SDAG-NEXT:    s_add_u32 s10, s10, 2
+; GFX942-SDAG-NEXT:    s_addc_u32 s11, s11, 0
+; GFX942-SDAG-NEXT:    v_cmp_ge_u64_e32 vcc, s[10:11], v[10:11]
+; GFX942-SDAG-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v5, 32, v5
+; GFX942-SDAG-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB13_2
+; GFX942-SDAG-NEXT:  .LBB13_5: ; %Flow3
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX942-SDAG-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB13_10
+; GFX942-SDAG-NEXT:  ; %bb.6: ; %memset.pattern-expansion-residual-body.preheader
+; GFX942-SDAG-NEXT:    v_lshlrev_b32_e32 v5, 3, v8
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v5, -16, v5
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v4, v4, v5
+; GFX942-SDAG-NEXT:    s_mov_b64 s[10:11], 0
+; GFX942-SDAG-NEXT:  .LBB13_7: ; %memset.pattern-expansion-residual-body
+; GFX942-SDAG-NEXT:    ; =>This Loop Header: Depth=1
+; GFX942-SDAG-NEXT:    ; Child Loop BB13_8 Depth 2
+; GFX942-SDAG-NEXT:    s_mov_b64 s[12:13], exec
+; GFX942-SDAG-NEXT:  .LBB13_8: ; Parent Loop BB13_7 Depth=1
+; GFX942-SDAG-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX942-SDAG-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX942-SDAG-NEXT:    buffer_store_dwordx2 v[12:13], v4, s[4:7], 0 offen
+; GFX942-SDAG-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB13_8
+; GFX942-SDAG-NEXT:  ; %bb.9: ; in Loop: Header=BB13_7 Depth=1
+; GFX942-SDAG-NEXT:    s_mov_b64 exec, s[12:13]
+; GFX942-SDAG-NEXT:    s_add_u32 s10, s10, 1
+; GFX942-SDAG-NEXT:    s_addc_u32 s11, s11, 0
+; GFX942-SDAG-NEXT:    v_cmp_ge_u64_e32 vcc, s[10:11], v[6:7]
+; GFX942-SDAG-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v4, 8, v4
+; GFX942-SDAG-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB13_7
+; GFX942-SDAG-NEXT:  .LBB13_10: ; %Flow1
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: memset_pattern_i64_as7_dynlen_dynval:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v10, v5
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v11, v6
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v6, 1, v10
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v15, v8
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v8, vcc, v10, v6
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v14, v7
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subbrev_co_u32_e32 v9, vcc, 0, v11, vcc
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v7, 0
+; GFX942-GISEL-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[8:9]
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB13_5
+; GFX942-GISEL-NEXT:  ; %bb.1: ; %memset.pattern-expansion-main-body.preheader
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[16:17], v[14:15]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[2:3]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v5, v4
+; GFX942-GISEL-NEXT:  .LBB13_2: ; %memset.pattern-expansion-main-body
+; GFX942-GISEL-NEXT:    ; =>This Loop Header: Depth=1
+; GFX942-GISEL-NEXT:    ; Child Loop BB13_3 Depth 2
+; GFX942-GISEL-NEXT:    s_mov_b64 s[10:11], exec
+; GFX942-GISEL-NEXT:  .LBB13_3: ; Parent Loop BB13_2 Depth=1
+; GFX942-GISEL-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX942-GISEL-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[14:17], v5, s[4:7], 0 offen
+; GFX942-GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB13_3
+; GFX942-GISEL-NEXT:  ; %bb.4: ; in Loop: Header=BB13_2 Depth=1
+; GFX942-GISEL-NEXT:    s_mov_b64 exec, s[10:11]
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v12, vcc, 2, v12
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v5, 32, v5
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v13, vcc
+; GFX942-GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[12:13], v[8:9]
+; GFX942-GISEL-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-GISEL-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB13_2
+; GFX942-GISEL-NEXT:  .LBB13_5: ; %Flow3
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v5, 1, v6
+; GFX942-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB13_10
+; GFX942-GISEL-NEXT:  ; %bb.6: ; %memset.pattern-expansion-residual-body.preheader
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[8:9], 1, v[10:11]
+; GFX942-GISEL-NEXT:    s_mov_b64 s[8:9], 0
+; GFX942-GISEL-NEXT:    v_lshl_add_u32 v8, v8, 4, v4
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[8:9]
+; GFX942-GISEL-NEXT:  .LBB13_7: ; %memset.pattern-expansion-residual-body
+; GFX942-GISEL-NEXT:    ; =>This Loop Header: Depth=1
+; GFX942-GISEL-NEXT:    ; Child Loop BB13_8 Depth 2
+; GFX942-GISEL-NEXT:    s_mov_b64 s[10:11], exec
+; GFX942-GISEL-NEXT:  .LBB13_8: ; Parent Loop BB13_7 Depth=1
+; GFX942-GISEL-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX942-GISEL-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    buffer_store_dwordx2 v[14:15], v8, s[4:7], 0 offen
+; GFX942-GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB13_8
+; GFX942-GISEL-NEXT:  ; %bb.9: ; in Loop: Header=BB13_7 Depth=1
+; GFX942-GISEL-NEXT:    s_mov_b64 exec, s[10:11]
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v4, vcc, 1, v4
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v8, 8, v8
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
+; GFX942-GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[4:5], v[6:7]
+; GFX942-GISEL-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX942-GISEL-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB13_7
+; GFX942-GISEL-NEXT:  .LBB13_10: ; %Flow1
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.experimental.memset.pattern(ptr addrspace(7) %a, i64 %val, i64 %len, i1 false)
+  ret void
+}
+
+define void @memset_pattern_i64_as7_len33_dynval(ptr addrspace(7) align 16 %a, i64 %val) {
+; GFX942-SDAG-LABEL: memset_pattern_i64_as7_len33_dynval:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v7, v6
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v6, v5
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v8, v5
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v9, v7
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v10, v5
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v11, v7
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v12, v5
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v13, v7
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v14, v5
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v15, v7
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, v5
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v17, v7
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v18, v5
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v19, v7
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v20, v5
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v21, v7
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v22, v5
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v23, v7
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v24, v5
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v25, v7
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v26, v5
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v27, v7
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v28, v5
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v29, v7
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v30, v5
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v31, v7
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v32, v5
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v33, v7
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v34, v5
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v35, v7
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v36, v5
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v37, v7
+; GFX942-SDAG-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, v4
+; GFX942-SDAG-NEXT:  .LBB14_1: ; %memset.pattern-expansion-main-body
+; GFX942-SDAG-NEXT:    ; =>This Loop Header: Depth=1
+; GFX942-SDAG-NEXT:    ; Child Loop BB14_2 Depth 2
+; GFX942-SDAG-NEXT:    ; Child Loop BB14_4 Depth 2
+; GFX942-SDAG-NEXT:    ; Child Loop BB14_6 Depth 2
+; GFX942-SDAG-NEXT:    ; Child Loop BB14_8 Depth 2
+; GFX942-SDAG-NEXT:    ; Child Loop BB14_10 Depth 2
+; GFX942-SDAG-NEXT:    ; Child Loop BB14_12 Depth 2
+; GFX942-SDAG-NEXT:    ; Child Loop BB14_14 Depth 2
+; GFX942-SDAG-NEXT:    ; Child Loop BB14_16 Depth 2
+; GFX942-SDAG-NEXT:    ; Child Loop BB14_18 Depth 2
+; GFX942-SDAG-NEXT:    ; Child Loop BB14_20 Depth 2
+; GFX942-SDAG-NEXT:    ; Child Loop BB14_22 Depth 2
+; GFX942-SDAG-NEXT:    ; Child Loop BB14_24 Depth 2
+; GFX942-SDAG-NEXT:    ; Child Loop BB14_26 Depth 2
+; GFX942-SDAG-NEXT:    ; Child Loop BB14_28 Depth 2
+; GFX942-SDAG-NEXT:    ; Child Loop BB14_30 Depth 2
+; GFX942-SDAG-NEXT:    ; Child Loop BB14_32 Depth 2
+; GFX942-SDAG-NEXT:    s_mov_b64 s[8:9], exec
+; GFX942-SDAG-NEXT:  .LBB14_2: ; Parent Loop BB14_1 Depth=1
+; GFX942-SDAG-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX942-SDAG-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[6:9], v5, s[4:7], 0 offen
+; GFX942-SDAG-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB14_2
+; GFX942-SDAG-NEXT:  ; %bb.3: ; in Loop: Header=BB14_1 Depth=1
+; GFX942-SDAG-NEXT:    s_mov_b64 exec, s[8:9]
+; GFX942-SDAG-NEXT:    s_mov_b64 s[8:9], exec
+; GFX942-SDAG-NEXT:  .LBB14_4: ; Parent Loop BB14_1 Depth=1
+; GFX942-SDAG-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX942-SDAG-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[10:13], v5, s[4:7], 0 offen offset:16
+; GFX942-SDAG-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB14_4
+; GFX942-SDAG-NEXT:  ; %bb.5: ; in Loop: Header=BB14_1 Depth=1
+; GFX942-SDAG-NEXT:    s_mov_b64 exec, s[8:9]
+; GFX942-SDAG-NEXT:    s_mov_b64 s[8:9], exec
+; GFX942-SDAG-NEXT:  .LBB14_6: ; Parent Loop BB14_1 Depth=1
+; GFX942-SDAG-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX942-SDAG-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[14:17], v5, s[4:7], 0 offen offset:32
+; GFX942-SDAG-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB14_6
+; GFX942-SDAG-NEXT:  ; %bb.7: ; in Loop: Header=BB14_1 Depth=1
+; GFX942-SDAG-NEXT:    s_mov_b64 exec, s[8:9]
+; GFX942-SDAG-NEXT:    s_mov_b64 s[8:9], exec
+; GFX942-SDAG-NEXT:  .LBB14_8: ; Parent Loop BB14_1 Depth=1
+; GFX942-SDAG-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX942-SDAG-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[18:21], v5, s[4:7], 0 offen offset:48
+; GFX942-SDAG-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB14_8
+; GFX942-SDAG-NEXT:  ; %bb.9: ; in Loop: Header=BB14_1 Depth=1
+; GFX942-SDAG-NEXT:    s_mov_b64 exec, s[8:9]
+; GFX942-SDAG-NEXT:    s_mov_b64 s[8:9], exec
+; GFX942-SDAG-NEXT:  .LBB14_10: ; Parent Loop BB14_1 Depth=1
+; GFX942-SDAG-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX942-SDAG-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[22:25], v5, s[4:7], 0 offen offset:64
+; GFX942-SDAG-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB14_10
+; GFX942-SDAG-NEXT:  ; %bb.11: ; in Loop: Header=BB14_1 Depth=1
+; GFX942-SDAG-NEXT:    s_mov_b64 exec, s[8:9]
+; GFX942-SDAG-NEXT:    s_mov_b64 s[8:9], exec
+; GFX942-SDAG-NEXT:  .LBB14_12: ; Parent Loop BB14_1 Depth=1
+; GFX942-SDAG-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX942-SDAG-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[26:29], v5, s[4:7], 0 offen offset:80
+; GFX942-SDAG-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB14_12
+; GFX942-SDAG-NEXT:  ; %bb.13: ; in Loop: Header=BB14_1 Depth=1
+; GFX942-SDAG-NEXT:    s_mov_b64 exec, s[8:9]
+; GFX942-SDAG-NEXT:    s_mov_b64 s[8:9], exec
+; GFX942-SDAG-NEXT:  .LBB14_14: ; Parent Loop BB14_1 Depth=1
+; GFX942-SDAG-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX942-SDAG-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[30:33], v5, s[4:7], 0 offen offset:96
+; GFX942-SDAG-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB14_14
+; GFX942-SDAG-NEXT:  ; %bb.15: ; in Loop: Header=BB14_1 Depth=1
+; GFX942-SDAG-NEXT:    s_mov_b64 exec, s[8:9]
+; GFX942-SDAG-NEXT:    s_mov_b64 s[8:9], exec
+; GFX942-SDAG-NEXT:  .LBB14_16: ; Parent Loop BB14_1 Depth=1
+; GFX942-SDAG-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX942-SDAG-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[34:37], v5, s[4:7], 0 offen offset:112
+; GFX942-SDAG-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB14_16
+; GFX942-SDAG-NEXT:  ; %bb.17: ; in Loop: Header=BB14_1 Depth=1
+; GFX942-SDAG-NEXT:    s_mov_b64 exec, s[8:9]
+; GFX942-SDAG-NEXT:    s_mov_b64 s[8:9], exec
+; GFX942-SDAG-NEXT:  .LBB14_18: ; Parent Loop BB14_1 Depth=1
+; GFX942-SDAG-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX942-SDAG-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[6:9], v5, s[4:7], 0 offen offset:128
+; GFX942-SDAG-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB14_18
+; GFX942-SDAG-NEXT:  ; %bb.19: ; in Loop: Header=BB14_1 Depth=1
+; GFX942-SDAG-NEXT:    s_mov_b64 exec, s[8:9]
+; GFX942-SDAG-NEXT:    s_mov_b64 s[8:9], exec
+; GFX942-SDAG-NEXT:  .LBB14_20: ; Parent Loop BB14_1 Depth=1
+; GFX942-SDAG-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX942-SDAG-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[10:13], v5, s[4:7], 0 offen offset:144
+; GFX942-SDAG-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB14_20
+; GFX942-SDAG-NEXT:  ; %bb.21: ; in Loop: Header=BB14_1 Depth=1
+; GFX942-SDAG-NEXT:    s_mov_b64 exec, s[8:9]
+; GFX942-SDAG-NEXT:    s_mov_b64 s[8:9], exec
+; GFX942-SDAG-NEXT:  .LBB14_22: ; Parent Loop BB14_1 Depth=1
+; GFX942-SDAG-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX942-SDAG-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[14:17], v5, s[4:7], 0 offen offset:160
+; GFX942-SDAG-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB14_22
+; GFX942-SDAG-NEXT:  ; %bb.23: ; in Loop: Header=BB14_1 Depth=1
+; GFX942-SDAG-NEXT:    s_mov_b64 exec, s[8:9]
+; GFX942-SDAG-NEXT:    s_mov_b64 s[8:9], exec
+; GFX942-SDAG-NEXT:  .LBB14_24: ; Parent Loop BB14_1 Depth=1
+; GFX942-SDAG-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX942-SDAG-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[18:21], v5, s[4:7], 0 offen offset:176
+; GFX942-SDAG-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB14_24
+; GFX942-SDAG-NEXT:  ; %bb.25: ; in Loop: Header=BB14_1 Depth=1
+; GFX942-SDAG-NEXT:    s_mov_b64 exec, s[8:9]
+; GFX942-SDAG-NEXT:    s_mov_b64 s[8:9], exec
+; GFX942-SDAG-NEXT:  .LBB14_26: ; Parent Loop BB14_1 Depth=1
+; GFX942-SDAG-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX942-SDAG-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[22:25], v5, s[4:7], 0 offen offset:192
+; GFX942-SDAG-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB14_26
+; GFX942-SDAG-NEXT:  ; %bb.27: ; in Loop: Header=BB14_1 Depth=1
+; GFX942-SDAG-NEXT:    s_mov_b64 exec, s[8:9]
+; GFX942-SDAG-NEXT:    s_mov_b64 s[8:9], exec
+; GFX942-SDAG-NEXT:  .LBB14_28: ; Parent Loop BB14_1 Depth=1
+; GFX942-SDAG-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX942-SDAG-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[26:29], v5, s[4:7], 0 offen offset:208
+; GFX942-SDAG-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB14_28
+; GFX942-SDAG-NEXT:  ; %bb.29: ; in Loop: Header=BB14_1 Depth=1
+; GFX942-SDAG-NEXT:    s_mov_b64 exec, s[8:9]
+; GFX942-SDAG-NEXT:    s_mov_b64 s[8:9], exec
+; GFX942-SDAG-NEXT:  .LBB14_30: ; Parent Loop BB14_1 Depth=1
+; GFX942-SDAG-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX942-SDAG-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[30:33], v5, s[4:7], 0 offen offset:224
+; GFX942-SDAG-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB14_30
+; GFX942-SDAG-NEXT:  ; %bb.31: ; in Loop: Header=BB14_1 Depth=1
+; GFX942-SDAG-NEXT:    s_mov_b64 exec, s[8:9]
+; GFX942-SDAG-NEXT:    s_mov_b64 s[8:9], exec
+; GFX942-SDAG-NEXT:  .LBB14_32: ; Parent Loop BB14_1 Depth=1
+; GFX942-SDAG-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX942-SDAG-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[34:37], v5, s[4:7], 0 offen offset:240
+; GFX942-SDAG-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB14_32
+; GFX942-SDAG-NEXT:  ; %bb.33: ; in Loop: Header=BB14_1 Depth=1
+; GFX942-SDAG-NEXT:    s_mov_b64 exec, s[8:9]
+; GFX942-SDAG-NEXT:    s_add_u32 s2, s2, 32
+; GFX942-SDAG-NEXT:    s_addc_u32 s3, s3, 0
+; GFX942-SDAG-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v5, 0x2000, v5
+; GFX942-SDAG-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_vccnz .LBB14_1
+; GFX942-SDAG-NEXT:  ; %bb.34: ; %memset.pattern-expansion-residual-body.preheader
+; GFX942-SDAG-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942-SDAG-NEXT:  .LBB14_35: ; =>This Inner Loop Header: Depth=1
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX942-SDAG-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX942-SDAG-NEXT:    buffer_store_dwordx2 v[6:7], v4, s[4:7], 0 offen offset:256
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr6_vgpr7
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr4
+; GFX942-SDAG-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB14_35
+; GFX942-SDAG-NEXT:  ; %bb.36:
+; GFX942-SDAG-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: memset_pattern_i64_as7_len33_dynval:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v8, v5
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v9, v6
+; GFX942-GISEL-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[8:9]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v5, v4
+; GFX942-GISEL-NEXT:  .LBB14_1: ; %memset.pattern-expansion-main-body
+; GFX942-GISEL-NEXT:    ; =>This Loop Header: Depth=1
+; GFX942-GISEL-NEXT:    ; Child Loop BB14_2 Depth 2
+; GFX942-GISEL-NEXT:    ; Child Loop BB14_4 Depth 2
+; GFX942-GISEL-NEXT:    ; Child Loop BB14_6 Depth 2
+; GFX942-GISEL-NEXT:    ; Child Loop BB14_8 Depth 2
+; GFX942-GISEL-NEXT:    ; Child Loop BB14_10 Depth 2
+; GFX942-GISEL-NEXT:    ; Child Loop BB14_12 Depth 2
+; GFX942-GISEL-NEXT:    ; Child Loop BB14_14 Depth 2
+; GFX942-GISEL-NEXT:    ; Child Loop BB14_16 Depth 2
+; GFX942-GISEL-NEXT:    ; Child Loop BB14_18 Depth 2
+; GFX942-GISEL-NEXT:    ; Child Loop BB14_20 Depth 2
+; GFX942-GISEL-NEXT:    ; Child Loop BB14_22 Depth 2
+; GFX942-GISEL-NEXT:    ; Child Loop BB14_24 Depth 2
+; GFX942-GISEL-NEXT:    ; Child Loop BB14_26 Depth 2
+; GFX942-GISEL-NEXT:    ; Child Loop BB14_28 Depth 2
+; GFX942-GISEL-NEXT:    ; Child Loop BB14_30 Depth 2
+; GFX942-GISEL-NEXT:    ; Child Loop BB14_32 Depth 2
+; GFX942-GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942-GISEL-NEXT:  .LBB14_2: ; Parent Loop BB14_1 Depth=1
+; GFX942-GISEL-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX942-GISEL-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v5, s[4:7], 0 offen
+; GFX942-GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB14_2
+; GFX942-GISEL-NEXT:  ; %bb.3: ; in Loop: Header=BB14_1 Depth=1
+; GFX942-GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX942-GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942-GISEL-NEXT:  .LBB14_4: ; Parent Loop BB14_1 Depth=1
+; GFX942-GISEL-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX942-GISEL-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v5, s[4:7], 0 offen offset:16
+; GFX942-GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB14_4
+; GFX942-GISEL-NEXT:  ; %bb.5: ; in Loop: Header=BB14_1 Depth=1
+; GFX942-GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX942-GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942-GISEL-NEXT:  .LBB14_6: ; Parent Loop BB14_1 Depth=1
+; GFX942-GISEL-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX942-GISEL-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v5, s[4:7], 0 offen offset:32
+; GFX942-GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB14_6
+; GFX942-GISEL-NEXT:  ; %bb.7: ; in Loop: Header=BB14_1 Depth=1
+; GFX942-GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX942-GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942-GISEL-NEXT:  .LBB14_8: ; Parent Loop BB14_1 Depth=1
+; GFX942-GISEL-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX942-GISEL-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v5, s[4:7], 0 offen offset:48
+; GFX942-GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB14_8
+; GFX942-GISEL-NEXT:  ; %bb.9: ; in Loop: Header=BB14_1 Depth=1
+; GFX942-GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX942-GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942-GISEL-NEXT:  .LBB14_10: ; Parent Loop BB14_1 Depth=1
+; GFX942-GISEL-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX942-GISEL-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v5, s[4:7], 0 offen offset:64
+; GFX942-GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB14_10
+; GFX942-GISEL-NEXT:  ; %bb.11: ; in Loop: Header=BB14_1 Depth=1
+; GFX942-GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX942-GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942-GISEL-NEXT:  .LBB14_12: ; Parent Loop BB14_1 Depth=1
+; GFX942-GISEL-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX942-GISEL-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v5, s[4:7], 0 offen offset:80
+; GFX942-GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB14_12
+; GFX942-GISEL-NEXT:  ; %bb.13: ; in Loop: Header=BB14_1 Depth=1
+; GFX942-GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX942-GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942-GISEL-NEXT:  .LBB14_14: ; Parent Loop BB14_1 Depth=1
+; GFX942-GISEL-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX942-GISEL-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v5, s[4:7], 0 offen offset:96
+; GFX942-GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB14_14
+; GFX942-GISEL-NEXT:  ; %bb.15: ; in Loop: Header=BB14_1 Depth=1
+; GFX942-GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX942-GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942-GISEL-NEXT:  .LBB14_16: ; Parent Loop BB14_1 Depth=1
+; GFX942-GISEL-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX942-GISEL-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v5, s[4:7], 0 offen offset:112
+; GFX942-GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB14_16
+; GFX942-GISEL-NEXT:  ; %bb.17: ; in Loop: Header=BB14_1 Depth=1
+; GFX942-GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX942-GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942-GISEL-NEXT:  .LBB14_18: ; Parent Loop BB14_1 Depth=1
+; GFX942-GISEL-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX942-GISEL-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v5, s[4:7], 0 offen offset:128
+; GFX942-GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB14_18
+; GFX942-GISEL-NEXT:  ; %bb.19: ; in Loop: Header=BB14_1 Depth=1
+; GFX942-GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX942-GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942-GISEL-NEXT:  .LBB14_20: ; Parent Loop BB14_1 Depth=1
+; GFX942-GISEL-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX942-GISEL-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v5, s[4:7], 0 offen offset:144
+; GFX942-GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB14_20
+; GFX942-GISEL-NEXT:  ; %bb.21: ; in Loop: Header=BB14_1 Depth=1
+; GFX942-GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX942-GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942-GISEL-NEXT:  .LBB14_22: ; Parent Loop BB14_1 Depth=1
+; GFX942-GISEL-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX942-GISEL-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v5, s[4:7], 0 offen offset:160
+; GFX942-GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB14_22
+; GFX942-GISEL-NEXT:  ; %bb.23: ; in Loop: Header=BB14_1 Depth=1
+; GFX942-GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX942-GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942-GISEL-NEXT:  .LBB14_24: ; Parent Loop BB14_1 Depth=1
+; GFX942-GISEL-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX942-GISEL-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v5, s[4:7], 0 offen offset:176
+; GFX942-GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB14_24
+; GFX942-GISEL-NEXT:  ; %bb.25: ; in Loop: Header=BB14_1 Depth=1
+; GFX942-GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX942-GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942-GISEL-NEXT:  .LBB14_26: ; Parent Loop BB14_1 Depth=1
+; GFX942-GISEL-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX942-GISEL-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v5, s[4:7], 0 offen offset:192
+; GFX942-GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB14_26
+; GFX942-GISEL-NEXT:  ; %bb.27: ; in Loop: Header=BB14_1 Depth=1
+; GFX942-GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX942-GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942-GISEL-NEXT:  .LBB14_28: ; Parent Loop BB14_1 Depth=1
+; GFX942-GISEL-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX942-GISEL-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v5, s[4:7], 0 offen offset:208
+; GFX942-GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB14_28
+; GFX942-GISEL-NEXT:  ; %bb.29: ; in Loop: Header=BB14_1 Depth=1
+; GFX942-GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX942-GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942-GISEL-NEXT:  .LBB14_30: ; Parent Loop BB14_1 Depth=1
+; GFX942-GISEL-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX942-GISEL-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v5, s[4:7], 0 offen offset:224
+; GFX942-GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB14_30
+; GFX942-GISEL-NEXT:  ; %bb.31: ; in Loop: Header=BB14_1 Depth=1
+; GFX942-GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX942-GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942-GISEL-NEXT:  .LBB14_32: ; Parent Loop BB14_1 Depth=1
+; GFX942-GISEL-NEXT:    ; => This Inner Loop Header: Depth=2
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX942-GISEL-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v5, s[4:7], 0 offen offset:240
+; GFX942-GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB14_32
+; GFX942-GISEL-NEXT:  ; %bb.33: ; in Loop: Header=BB14_1 Depth=1
+; GFX942-GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v6, vcc, 32, v6
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v5, 0x2000, v5
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
+; GFX942-GISEL-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX942-GISEL-NEXT:    s_xor_b64 s[0:1], s[0:1], -1
+; GFX942-GISEL-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; GFX942-GISEL-NEXT:    s_cbranch_vccnz .LBB14_1
+; GFX942-GISEL-NEXT:  ; %bb.34: ; %memset.pattern-expansion-residual-body.preheader
+; GFX942-GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942-GISEL-NEXT:  .LBB14_35: ; =>This Inner Loop Header: Depth=1
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX942-GISEL-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    buffer_store_dwordx2 v[8:9], v4, s[4:7], 0 offen offset:256
+; GFX942-GISEL-NEXT:    ; implicit-def: $vgpr0
+; GFX942-GISEL-NEXT:    ; implicit-def: $vgpr8_vgpr9
+; GFX942-GISEL-NEXT:    ; implicit-def: $vgpr4
+; GFX942-GISEL-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX942-GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB14_35
+; GFX942-GISEL-NEXT:  ; %bb.36:
+; GFX942-GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  call void @llvm.experimental.memset.pattern(ptr addrspace(7) %a, i64 %val, i64 33, i1 false)
+  ret void
+}

>From 7ee4f09d04771d121011d8d97108055b1ad9c349 Mon Sep 17 00:00:00 2001
From: Fabian Ritter <fabian.ritter at amd.com>
Date: Thu, 12 Mar 2026 13:25:19 -0400
Subject: [PATCH 3/3] Use inreg for AS7 pointer arguments to make them
 wave-uniform

---
 llvm/test/CodeGen/AMDGPU/memset-pattern.ll | 1295 +++++---------------
 1 file changed, 292 insertions(+), 1003 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/memset-pattern.ll b/llvm/test/CodeGen/AMDGPU/memset-pattern.ll
index cec1b86d4ad94..750ba2f3afa1f 100644
--- a/llvm/test/CodeGen/AMDGPU/memset-pattern.ll
+++ b/llvm/test/CodeGen/AMDGPU/memset-pattern.ll
@@ -743,1069 +743,358 @@ define void @memset_pattern_i96_dynval_dynlen(ptr addrspace(1) align 16 %a, i96
   ret void
 }
 
-define void @memset_pattern_i64_as7_dynlen(ptr addrspace(7) align 16 %a, i64 %len) {
+define void @memset_pattern_i64_as7_dynlen(ptr addrspace(7) inreg align 16 %a, i32 %offset, i64 %len) {
 ; GFX942-SDAG-LABEL: memset_pattern_i64_as7_dynlen:
 ; GFX942-SDAG:       ; %bb.0:
 ; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v12, v5
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v15, v6
-; GFX942-SDAG-NEXT:    v_and_b32_e32 v14, -2, v12
-; GFX942-SDAG-NEXT:    v_and_b32_e32 v10, 1, v12
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v11, 0
-; GFX942-SDAG-NEXT:    s_mov_b64 s[2:3], 0
-; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[14:15]
-; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[8:9], vcc
-; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB12_5
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v8, v1
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v11, v2
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v10, -2, v8
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v6, 1, v8
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v7, 0
+; GFX942-SDAG-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB12_3
 ; GFX942-SDAG-NEXT:  ; %bb.1: ; %memset.pattern-expansion-main-body.preheader
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v6, 0xccccdddd
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v7, 0xaaaabbbb
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v8, v6
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v9, v7
-; GFX942-SDAG-NEXT:    s_mov_b64 s[10:11], 0
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, v4
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, 0xccccdddd
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, 0xaaaabbbb
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v1, s16, v0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, v2
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, v3
+; GFX942-SDAG-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX942-SDAG-NEXT:  .LBB12_2: ; %memset.pattern-expansion-main-body
-; GFX942-SDAG-NEXT:    ; =>This Loop Header: Depth=1
-; GFX942-SDAG-NEXT:    ; Child Loop BB12_3 Depth 2
-; GFX942-SDAG-NEXT:    s_mov_b64 s[12:13], exec
-; GFX942-SDAG-NEXT:  .LBB12_3: ; Parent Loop BB12_2 Depth=1
-; GFX942-SDAG-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX942-SDAG-NEXT:    s_nop 0
-; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX942-SDAG-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[6:9], v5, s[4:7], 0 offen
-; GFX942-SDAG-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB12_3
-; GFX942-SDAG-NEXT:  ; %bb.4: ; in Loop: Header=BB12_2 Depth=1
-; GFX942-SDAG-NEXT:    s_mov_b64 exec, s[12:13]
-; GFX942-SDAG-NEXT:    s_add_u32 s10, s10, 2
-; GFX942-SDAG-NEXT:    s_addc_u32 s11, s11, 0
-; GFX942-SDAG-NEXT:    v_cmp_ge_u64_e32 vcc, s[10:11], v[14:15]
-; GFX942-SDAG-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX942-SDAG-NEXT:    v_add_u32_e32 v5, 32, v5
-; GFX942-SDAG-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-SDAG-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-SDAG-NEXT:    s_add_u32 s8, s8, 2
+; GFX942-SDAG-NEXT:    s_addc_u32 s9, s9, 0
+; GFX942-SDAG-NEXT:    v_cmp_ge_u64_e32 vcc, s[8:9], v[10:11]
+; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[2:5], v1, s[0:3], 0 offen
+; GFX942-SDAG-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v1, 32, v1
+; GFX942-SDAG-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB12_2
-; GFX942-SDAG-NEXT:  .LBB12_5: ; %Flow3
-; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX942-SDAG-NEXT:    s_mov_b64 s[2:3], 0
-; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
-; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[8:9], vcc
-; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB12_10
-; GFX942-SDAG-NEXT:  ; %bb.6: ; %memset.pattern-expansion-residual-body.preheader
-; GFX942-SDAG-NEXT:    v_lshlrev_b32_e32 v5, 3, v12
-; GFX942-SDAG-NEXT:    v_and_b32_e32 v5, -16, v5
-; GFX942-SDAG-NEXT:    v_add_u32_e32 v6, v4, v5
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, 0xccccdddd
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, 0xaaaabbbb
-; GFX942-SDAG-NEXT:    s_mov_b64 s[10:11], 0
-; GFX942-SDAG-NEXT:  .LBB12_7: ; %memset.pattern-expansion-residual-body
-; GFX942-SDAG-NEXT:    ; =>This Loop Header: Depth=1
-; GFX942-SDAG-NEXT:    ; Child Loop BB12_8 Depth 2
-; GFX942-SDAG-NEXT:    s_mov_b64 s[12:13], exec
-; GFX942-SDAG-NEXT:  .LBB12_8: ; Parent Loop BB12_7 Depth=1
-; GFX942-SDAG-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX942-SDAG-NEXT:    s_nop 0
-; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX942-SDAG-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-SDAG-NEXT:    buffer_store_dwordx2 v[4:5], v6, s[4:7], 0 offen
-; GFX942-SDAG-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB12_8
-; GFX942-SDAG-NEXT:  ; %bb.9: ; in Loop: Header=BB12_7 Depth=1
-; GFX942-SDAG-NEXT:    s_mov_b64 exec, s[12:13]
-; GFX942-SDAG-NEXT:    s_add_u32 s10, s10, 1
-; GFX942-SDAG-NEXT:    s_addc_u32 s11, s11, 0
-; GFX942-SDAG-NEXT:    v_cmp_ge_u64_e32 vcc, s[10:11], v[10:11]
-; GFX942-SDAG-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX942-SDAG-NEXT:    v_add_u32_e32 v6, 8, v6
-; GFX942-SDAG-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB12_7
-; GFX942-SDAG-NEXT:  .LBB12_10: ; %Flow1
-; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX942-SDAG-NEXT:  .LBB12_3: ; %Flow3
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX942-SDAG-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB12_6
+; GFX942-SDAG-NEXT:  ; %bb.4: ; %memset.pattern-expansion-residual-body.preheader
+; GFX942-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 3, v8
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v1, -16, v1
+; GFX942-SDAG-NEXT:    v_add3_u32 v2, v0, v1, s16
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0xccccdddd
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, 0xaaaabbbb
+; GFX942-SDAG-NEXT:    s_mov_b64 s[8:9], 0
+; GFX942-SDAG-NEXT:  .LBB12_5: ; %memset.pattern-expansion-residual-body
+; GFX942-SDAG-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-SDAG-NEXT:    s_add_u32 s8, s8, 1
+; GFX942-SDAG-NEXT:    s_addc_u32 s9, s9, 0
+; GFX942-SDAG-NEXT:    v_cmp_ge_u64_e32 vcc, s[8:9], v[6:7]
+; GFX942-SDAG-NEXT:    buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen
+; GFX942-SDAG-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v2, 8, v2
+; GFX942-SDAG-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB12_5
+; GFX942-SDAG-NEXT:  .LBB12_6: ; %Flow1
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-GISEL-LABEL: memset_pattern_i64_as7_dynlen:
 ; GFX942-GISEL:       ; %bb.0:
 ; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-GISEL-NEXT:    v_mov_b32_e32 v12, v5
-; GFX942-GISEL-NEXT:    v_and_b32_e32 v10, 1, v12
-; GFX942-GISEL-NEXT:    v_mov_b32_e32 v13, v6
-; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v14, vcc, v12, v10
-; GFX942-GISEL-NEXT:    v_mov_b32_e32 v11, 0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v8, v1
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v6, 1, v8
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v9, v2
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v10, vcc, v8, v6
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX942-GISEL-NEXT:    s_nop 0
-; GFX942-GISEL-NEXT:    v_subbrev_co_u32_e32 v15, vcc, 0, v13, vcc
-; GFX942-GISEL-NEXT:    s_mov_b64 s[2:3], 0
-; GFX942-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[14:15]
-; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
-; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB12_5
+; GFX942-GISEL-NEXT:    v_subbrev_co_u32_e32 v11, vcc, 0, v9, vcc
+; GFX942-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB12_3
 ; GFX942-GISEL-NEXT:  ; %bb.1: ; %memset.pattern-expansion-main-body.preheader
-; GFX942-GISEL-NEXT:    s_mov_b32 s4, 0xccccdddd
-; GFX942-GISEL-NEXT:    s_mov_b32 s5, 0xaaaabbbb
-; GFX942-GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[6:7]
-; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[4:5]
-; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[2:3]
-; GFX942-GISEL-NEXT:    v_mov_b32_e32 v5, v4
+; GFX942-GISEL-NEXT:    s_mov_b32 s8, 0xccccdddd
+; GFX942-GISEL-NEXT:    s_mov_b32 s9, 0xaaaabbbb
+; GFX942-GISEL-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[8:9]
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v1, s16, v0
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[10:11]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[4:5]
 ; GFX942-GISEL-NEXT:  .LBB12_2: ; %memset.pattern-expansion-main-body
-; GFX942-GISEL-NEXT:    ; =>This Loop Header: Depth=1
-; GFX942-GISEL-NEXT:    ; Child Loop BB12_3 Depth 2
-; GFX942-GISEL-NEXT:    s_mov_b64 s[10:11], exec
-; GFX942-GISEL-NEXT:  .LBB12_3: ; Parent Loop BB12_2 Depth=1
-; GFX942-GISEL-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX942-GISEL-NEXT:    s_nop 0
-; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX942-GISEL-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[6:9], v5, s[4:7], 0 offen
-; GFX942-GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB12_3
-; GFX942-GISEL-NEXT:  ; %bb.4: ; in Loop: Header=BB12_2 Depth=1
-; GFX942-GISEL-NEXT:    s_mov_b64 exec, s[10:11]
-; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v16, vcc, 2, v16
-; GFX942-GISEL-NEXT:    v_add_u32_e32 v5, 32, v5
+; GFX942-GISEL-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v12, vcc, 2, v12
+; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[2:5], v1, s[0:3], 0 offen
 ; GFX942-GISEL-NEXT:    s_nop 0
-; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v17, vcc, 0, v17, vcc
-; GFX942-GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[16:17], v[14:15]
-; GFX942-GISEL-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX942-GISEL-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v13, vcc
+; GFX942-GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[12:13], v[10:11]
+; GFX942-GISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v1, 32, v1
+; GFX942-GISEL-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB12_2
-; GFX942-GISEL-NEXT:  .LBB12_5: ; %Flow3
-; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX942-GISEL-NEXT:    v_and_b32_e32 v5, 1, v10
-; GFX942-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB12_10
-; GFX942-GISEL-NEXT:  ; %bb.6: ; %memset.pattern-expansion-residual-body.preheader
-; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[6:7], 1, v[12:13]
-; GFX942-GISEL-NEXT:    v_lshl_add_u32 v8, v6, 4, v4
-; GFX942-GISEL-NEXT:    s_mov_b64 s[8:9], 0
-; GFX942-GISEL-NEXT:    v_mov_b32_e32 v4, 0xccccdddd
-; GFX942-GISEL-NEXT:    v_mov_b32_e32 v5, 0xaaaabbbb
-; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[8:9]
-; GFX942-GISEL-NEXT:  .LBB12_7: ; %memset.pattern-expansion-residual-body
-; GFX942-GISEL-NEXT:    ; =>This Loop Header: Depth=1
-; GFX942-GISEL-NEXT:    ; Child Loop BB12_8 Depth 2
-; GFX942-GISEL-NEXT:    s_mov_b64 s[10:11], exec
-; GFX942-GISEL-NEXT:  .LBB12_8: ; Parent Loop BB12_7 Depth=1
-; GFX942-GISEL-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX942-GISEL-NEXT:    s_nop 0
-; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX942-GISEL-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-GISEL-NEXT:    buffer_store_dwordx2 v[4:5], v8, s[4:7], 0 offen
-; GFX942-GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB12_8
-; GFX942-GISEL-NEXT:  ; %bb.9: ; in Loop: Header=BB12_7 Depth=1
-; GFX942-GISEL-NEXT:    s_mov_b64 exec, s[10:11]
-; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v6, vcc, 1, v6
-; GFX942-GISEL-NEXT:    v_add_u32_e32 v8, 8, v8
+; GFX942-GISEL-NEXT:  .LBB12_3: ; %Flow3
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v1, 1, v6
+; GFX942-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB12_6
+; GFX942-GISEL-NEXT:  ; %bb.4: ; %memset.pattern-expansion-residual-body.preheader
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[2:3], 1, v[8:9]
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 4, v2
+; GFX942-GISEL-NEXT:    v_add3_u32 v4, v0, v1, s16
+; GFX942-GISEL-NEXT:    s_mov_b64 s[6:7], 0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, 0xccccdddd
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 0xaaaabbbb
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-GISEL-NEXT:  .LBB12_5: ; %memset.pattern-expansion-residual-body
+; GFX942-GISEL-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v2, vcc, 1, v2
+; GFX942-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], v4, s[0:3], 0 offen
 ; GFX942-GISEL-NEXT:    s_nop 0
-; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
-; GFX942-GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[6:7], v[10:11]
-; GFX942-GISEL-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX942-GISEL-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB12_7
-; GFX942-GISEL-NEXT:  .LBB12_10: ; %Flow1
-; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX942-GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[2:3], v[6:7]
+; GFX942-GISEL-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v4, 8, v4
+; GFX942-GISEL-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB12_5
+; GFX942-GISEL-NEXT:  .LBB12_6: ; %Flow1
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
-  call void @llvm.experimental.memset.pattern(ptr addrspace(7) %a, i64 u0xaaaabbbbccccdddd, i64 %len, i1 false)
+  %p = getelementptr inbounds i8, ptr addrspace(7) %a, i32 %offset
+  call void @llvm.experimental.memset.pattern(ptr addrspace(7) %p, i64 u0xaaaabbbbccccdddd, i64 %len, i1 false)
   ret void
 }
 
-define void @memset_pattern_i64_as7_dynlen_dynval(ptr addrspace(7) align 16 %a, i64 %len, i64 %val) {
+define void @memset_pattern_i64_as7_dynlen_dynval(ptr addrspace(7) inreg align 16 %a, i32 %offset, i64 %len, i64 %val) {
 ; GFX942-SDAG-LABEL: memset_pattern_i64_as7_dynlen_dynval:
 ; GFX942-SDAG:       ; %bb.0:
 ; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v13, v8
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v8, v5
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v11, v6
-; GFX942-SDAG-NEXT:    v_and_b32_e32 v10, -2, v8
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v12, v7
-; GFX942-SDAG-NEXT:    v_and_b32_e32 v6, 1, v8
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v7, 0
-; GFX942-SDAG-NEXT:    s_mov_b64 s[2:3], 0
-; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
-; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[8:9], vcc
-; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB13_5
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v9, v4
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, v1
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v7, v2
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v6, -2, v4
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v8, v3
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v2, 1, v4
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-SDAG-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB13_3
 ; GFX942-SDAG-NEXT:  ; %bb.1: ; %memset.pattern-expansion-main-body.preheader
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v14, v12
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v15, v13
-; GFX942-SDAG-NEXT:    s_mov_b64 s[10:11], 0
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, v4
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v1, s16, v0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v10, v8
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v11, v9
+; GFX942-SDAG-NEXT:    s_mov_b64 s[8:9], 0
 ; GFX942-SDAG-NEXT:  .LBB13_2: ; %memset.pattern-expansion-main-body
-; GFX942-SDAG-NEXT:    ; =>This Loop Header: Depth=1
-; GFX942-SDAG-NEXT:    ; Child Loop BB13_3 Depth 2
-; GFX942-SDAG-NEXT:    s_mov_b64 s[12:13], exec
-; GFX942-SDAG-NEXT:  .LBB13_3: ; Parent Loop BB13_2 Depth=1
-; GFX942-SDAG-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX942-SDAG-NEXT:    s_nop 0
-; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX942-SDAG-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[12:15], v5, s[4:7], 0 offen
-; GFX942-SDAG-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB13_3
-; GFX942-SDAG-NEXT:  ; %bb.4: ; in Loop: Header=BB13_2 Depth=1
-; GFX942-SDAG-NEXT:    s_mov_b64 exec, s[12:13]
-; GFX942-SDAG-NEXT:    s_add_u32 s10, s10, 2
-; GFX942-SDAG-NEXT:    s_addc_u32 s11, s11, 0
-; GFX942-SDAG-NEXT:    v_cmp_ge_u64_e32 vcc, s[10:11], v[10:11]
-; GFX942-SDAG-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX942-SDAG-NEXT:    v_add_u32_e32 v5, 32, v5
-; GFX942-SDAG-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-SDAG-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-SDAG-NEXT:    s_add_u32 s8, s8, 2
+; GFX942-SDAG-NEXT:    s_addc_u32 s9, s9, 0
+; GFX942-SDAG-NEXT:    v_cmp_ge_u64_e32 vcc, s[8:9], v[6:7]
+; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[8:11], v1, s[0:3], 0 offen
+; GFX942-SDAG-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v1, 32, v1
+; GFX942-SDAG-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB13_2
-; GFX942-SDAG-NEXT:  .LBB13_5: ; %Flow3
-; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX942-SDAG-NEXT:    s_mov_b64 s[2:3], 0
-; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[6:7]
-; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[8:9], vcc
-; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB13_10
-; GFX942-SDAG-NEXT:  ; %bb.6: ; %memset.pattern-expansion-residual-body.preheader
-; GFX942-SDAG-NEXT:    v_lshlrev_b32_e32 v5, 3, v8
-; GFX942-SDAG-NEXT:    v_and_b32_e32 v5, -16, v5
-; GFX942-SDAG-NEXT:    v_add_u32_e32 v4, v4, v5
-; GFX942-SDAG-NEXT:    s_mov_b64 s[10:11], 0
-; GFX942-SDAG-NEXT:  .LBB13_7: ; %memset.pattern-expansion-residual-body
-; GFX942-SDAG-NEXT:    ; =>This Loop Header: Depth=1
-; GFX942-SDAG-NEXT:    ; Child Loop BB13_8 Depth 2
-; GFX942-SDAG-NEXT:    s_mov_b64 s[12:13], exec
-; GFX942-SDAG-NEXT:  .LBB13_8: ; Parent Loop BB13_7 Depth=1
-; GFX942-SDAG-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX942-SDAG-NEXT:    s_nop 0
-; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX942-SDAG-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-SDAG-NEXT:    buffer_store_dwordx2 v[12:13], v4, s[4:7], 0 offen
-; GFX942-SDAG-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB13_8
-; GFX942-SDAG-NEXT:  ; %bb.9: ; in Loop: Header=BB13_7 Depth=1
-; GFX942-SDAG-NEXT:    s_mov_b64 exec, s[12:13]
-; GFX942-SDAG-NEXT:    s_add_u32 s10, s10, 1
-; GFX942-SDAG-NEXT:    s_addc_u32 s11, s11, 0
-; GFX942-SDAG-NEXT:    v_cmp_ge_u64_e32 vcc, s[10:11], v[6:7]
-; GFX942-SDAG-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX942-SDAG-NEXT:    v_add_u32_e32 v4, 8, v4
-; GFX942-SDAG-NEXT:    s_andn2_b64 exec, exec, s[2:3]
-; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB13_7
-; GFX942-SDAG-NEXT:  .LBB13_10: ; %Flow1
-; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX942-SDAG-NEXT:  .LBB13_3: ; %Flow3
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX942-SDAG-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB13_6
+; GFX942-SDAG-NEXT:  ; %bb.4: ; %memset.pattern-expansion-residual-body.preheader
+; GFX942-SDAG-NEXT:    v_lshlrev_b32_e32 v1, 3, v4
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v1, -16, v1
+; GFX942-SDAG-NEXT:    v_add3_u32 v0, v0, v1, s16
+; GFX942-SDAG-NEXT:    s_mov_b64 s[8:9], 0
+; GFX942-SDAG-NEXT:  .LBB13_5: ; %memset.pattern-expansion-residual-body
+; GFX942-SDAG-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-SDAG-NEXT:    s_add_u32 s8, s8, 1
+; GFX942-SDAG-NEXT:    s_addc_u32 s9, s9, 0
+; GFX942-SDAG-NEXT:    v_cmp_ge_u64_e32 vcc, s[8:9], v[2:3]
+; GFX942-SDAG-NEXT:    buffer_store_dwordx2 v[8:9], v0, s[0:3], 0 offen
+; GFX942-SDAG-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v0, 8, v0
+; GFX942-SDAG-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB13_5
+; GFX942-SDAG-NEXT:  .LBB13_6: ; %Flow1
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-GISEL-LABEL: memset_pattern_i64_as7_dynlen_dynval:
 ; GFX942-GISEL:       ; %bb.0:
 ; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-GISEL-NEXT:    v_mov_b32_e32 v10, v5
-; GFX942-GISEL-NEXT:    v_mov_b32_e32 v11, v6
-; GFX942-GISEL-NEXT:    v_and_b32_e32 v6, 1, v10
-; GFX942-GISEL-NEXT:    v_mov_b32_e32 v15, v8
-; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v8, vcc, v10, v6
-; GFX942-GISEL-NEXT:    v_mov_b32_e32 v14, v7
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v6, v1
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v7, v2
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v2, 1, v6
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v11, v4
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v4, vcc, v6, v2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v10, v3
 ; GFX942-GISEL-NEXT:    s_nop 0
-; GFX942-GISEL-NEXT:    v_subbrev_co_u32_e32 v9, vcc, 0, v11, vcc
-; GFX942-GISEL-NEXT:    v_mov_b32_e32 v7, 0
-; GFX942-GISEL-NEXT:    s_mov_b64 s[2:3], 0
-; GFX942-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[8:9]
-; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
-; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB13_5
+; GFX942-GISEL-NEXT:    v_subbrev_co_u32_e32 v5, vcc, 0, v7, vcc
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB13_3
 ; GFX942-GISEL-NEXT:  ; %bb.1: ; %memset.pattern-expansion-main-body.preheader
-; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[16:17], v[14:15]
-; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[2:3]
-; GFX942-GISEL-NEXT:    v_mov_b32_e32 v5, v4
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v1, s16, v0
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[12:13], v[10:11]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[4:5]
 ; GFX942-GISEL-NEXT:  .LBB13_2: ; %memset.pattern-expansion-main-body
-; GFX942-GISEL-NEXT:    ; =>This Loop Header: Depth=1
-; GFX942-GISEL-NEXT:    ; Child Loop BB13_3 Depth 2
-; GFX942-GISEL-NEXT:    s_mov_b64 s[10:11], exec
-; GFX942-GISEL-NEXT:  .LBB13_3: ; Parent Loop BB13_2 Depth=1
-; GFX942-GISEL-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX942-GISEL-NEXT:    s_nop 0
-; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX942-GISEL-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[14:17], v5, s[4:7], 0 offen
-; GFX942-GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB13_3
-; GFX942-GISEL-NEXT:  ; %bb.4: ; in Loop: Header=BB13_2 Depth=1
-; GFX942-GISEL-NEXT:    s_mov_b64 exec, s[10:11]
-; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v12, vcc, 2, v12
-; GFX942-GISEL-NEXT:    v_add_u32_e32 v5, 32, v5
+; GFX942-GISEL-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v8, vcc, 2, v8
+; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[10:13], v1, s[0:3], 0 offen
 ; GFX942-GISEL-NEXT:    s_nop 0
-; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v13, vcc
-; GFX942-GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[12:13], v[8:9]
-; GFX942-GISEL-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
-; GFX942-GISEL-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
+; GFX942-GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[8:9], v[4:5]
+; GFX942-GISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v1, 32, v1
+; GFX942-GISEL-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB13_2
-; GFX942-GISEL-NEXT:  .LBB13_5: ; %Flow3
-; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[8:9]
-; GFX942-GISEL-NEXT:    v_and_b32_e32 v5, 1, v6
-; GFX942-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB13_10
-; GFX942-GISEL-NEXT:  ; %bb.6: ; %memset.pattern-expansion-residual-body.preheader
-; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[8:9], 1, v[10:11]
-; GFX942-GISEL-NEXT:    s_mov_b64 s[8:9], 0
-; GFX942-GISEL-NEXT:    v_lshl_add_u32 v8, v8, 4, v4
-; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[8:9]
-; GFX942-GISEL-NEXT:  .LBB13_7: ; %memset.pattern-expansion-residual-body
-; GFX942-GISEL-NEXT:    ; =>This Loop Header: Depth=1
-; GFX942-GISEL-NEXT:    ; Child Loop BB13_8 Depth 2
-; GFX942-GISEL-NEXT:    s_mov_b64 s[10:11], exec
-; GFX942-GISEL-NEXT:  .LBB13_8: ; Parent Loop BB13_7 Depth=1
-; GFX942-GISEL-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX942-GISEL-NEXT:    s_nop 0
-; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX942-GISEL-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-GISEL-NEXT:    buffer_store_dwordx2 v[14:15], v8, s[4:7], 0 offen
-; GFX942-GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB13_8
-; GFX942-GISEL-NEXT:  ; %bb.9: ; in Loop: Header=BB13_7 Depth=1
-; GFX942-GISEL-NEXT:    s_mov_b64 exec, s[10:11]
-; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v4, vcc, 1, v4
-; GFX942-GISEL-NEXT:    v_add_u32_e32 v8, 8, v8
+; GFX942-GISEL-NEXT:  .LBB13_3: ; %Flow3
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v1, 1, v2
+; GFX942-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB13_6
+; GFX942-GISEL-NEXT:  ; %bb.4: ; %memset.pattern-expansion-residual-body.preheader
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[4:5], 1, v[6:7]
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 4, v4
+; GFX942-GISEL-NEXT:    s_mov_b64 s[6:7], 0
+; GFX942-GISEL-NEXT:    v_add3_u32 v4, v0, v1, s16
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
+; GFX942-GISEL-NEXT:  .LBB13_5: ; %memset.pattern-expansion-residual-body
+; GFX942-GISEL-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 1, v0
+; GFX942-GISEL-NEXT:    buffer_store_dwordx2 v[10:11], v4, s[0:3], 0 offen
 ; GFX942-GISEL-NEXT:    s_nop 0
-; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
-; GFX942-GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[4:5], v[6:7]
-; GFX942-GISEL-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
-; GFX942-GISEL-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB13_7
-; GFX942-GISEL-NEXT:  .LBB13_10: ; %Flow1
-; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX942-GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[0:1], v[2:3]
+; GFX942-GISEL-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v4, 8, v4
+; GFX942-GISEL-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB13_5
+; GFX942-GISEL-NEXT:  .LBB13_6: ; %Flow1
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
-  call void @llvm.experimental.memset.pattern(ptr addrspace(7) %a, i64 %val, i64 %len, i1 false)
+  %p = getelementptr inbounds i8, ptr addrspace(7) %a, i32 %offset
+  call void @llvm.experimental.memset.pattern(ptr addrspace(7) %p, i64 %val, i64 %len, i1 false)
   ret void
 }
 
-define void @memset_pattern_i64_as7_len33_dynval(ptr addrspace(7) align 16 %a, i64 %val) {
+define void @memset_pattern_i64_as7_len33_dynval(ptr addrspace(7) inreg align 16 %a, i32 %offset, i64 %val) {
 ; GFX942-SDAG-LABEL: memset_pattern_i64_as7_len33_dynval:
 ; GFX942-SDAG:       ; %bb.0:
 ; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v7, v6
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v6, v5
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v8, v5
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v9, v7
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v10, v5
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v11, v7
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v12, v5
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v13, v7
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v14, v5
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v15, v7
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, v5
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v17, v7
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v18, v5
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v19, v7
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v20, v5
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v21, v7
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v22, v5
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v23, v7
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v24, v5
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v25, v7
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v26, v5
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v27, v7
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v28, v5
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v29, v7
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v30, v5
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v31, v7
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v32, v5
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v33, v7
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v34, v5
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v35, v7
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v36, v5
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v37, v7
-; GFX942-SDAG-NEXT:    s_mov_b64 s[2:3], 0
-; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, v4
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, v2
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v0, s16, v0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, v1
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, v1
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, v3
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v6, v1
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v7, v3
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v8, v1
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v9, v3
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v10, v1
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v11, v3
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v12, v1
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v13, v3
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v14, v1
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v15, v3
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, v1
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v17, v3
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v18, v1
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v19, v3
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v20, v1
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v21, v3
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v22, v1
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v23, v3
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v24, v1
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v25, v3
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v26, v1
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v27, v3
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v28, v1
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v29, v3
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v30, v1
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v31, v3
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v32, v1
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v33, v3
+; GFX942-SDAG-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX942-SDAG-NEXT:  .LBB14_1: ; %memset.pattern-expansion-main-body
-; GFX942-SDAG-NEXT:    ; =>This Loop Header: Depth=1
-; GFX942-SDAG-NEXT:    ; Child Loop BB14_2 Depth 2
-; GFX942-SDAG-NEXT:    ; Child Loop BB14_4 Depth 2
-; GFX942-SDAG-NEXT:    ; Child Loop BB14_6 Depth 2
-; GFX942-SDAG-NEXT:    ; Child Loop BB14_8 Depth 2
-; GFX942-SDAG-NEXT:    ; Child Loop BB14_10 Depth 2
-; GFX942-SDAG-NEXT:    ; Child Loop BB14_12 Depth 2
-; GFX942-SDAG-NEXT:    ; Child Loop BB14_14 Depth 2
-; GFX942-SDAG-NEXT:    ; Child Loop BB14_16 Depth 2
-; GFX942-SDAG-NEXT:    ; Child Loop BB14_18 Depth 2
-; GFX942-SDAG-NEXT:    ; Child Loop BB14_20 Depth 2
-; GFX942-SDAG-NEXT:    ; Child Loop BB14_22 Depth 2
-; GFX942-SDAG-NEXT:    ; Child Loop BB14_24 Depth 2
-; GFX942-SDAG-NEXT:    ; Child Loop BB14_26 Depth 2
-; GFX942-SDAG-NEXT:    ; Child Loop BB14_28 Depth 2
-; GFX942-SDAG-NEXT:    ; Child Loop BB14_30 Depth 2
-; GFX942-SDAG-NEXT:    ; Child Loop BB14_32 Depth 2
-; GFX942-SDAG-NEXT:    s_mov_b64 s[8:9], exec
-; GFX942-SDAG-NEXT:  .LBB14_2: ; Parent Loop BB14_1 Depth=1
-; GFX942-SDAG-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX942-SDAG-NEXT:    s_nop 0
-; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX942-SDAG-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[6:9], v5, s[4:7], 0 offen
-; GFX942-SDAG-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB14_2
-; GFX942-SDAG-NEXT:  ; %bb.3: ; in Loop: Header=BB14_1 Depth=1
-; GFX942-SDAG-NEXT:    s_mov_b64 exec, s[8:9]
-; GFX942-SDAG-NEXT:    s_mov_b64 s[8:9], exec
-; GFX942-SDAG-NEXT:  .LBB14_4: ; Parent Loop BB14_1 Depth=1
-; GFX942-SDAG-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX942-SDAG-NEXT:    s_nop 0
-; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX942-SDAG-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[10:13], v5, s[4:7], 0 offen offset:16
-; GFX942-SDAG-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB14_4
-; GFX942-SDAG-NEXT:  ; %bb.5: ; in Loop: Header=BB14_1 Depth=1
-; GFX942-SDAG-NEXT:    s_mov_b64 exec, s[8:9]
-; GFX942-SDAG-NEXT:    s_mov_b64 s[8:9], exec
-; GFX942-SDAG-NEXT:  .LBB14_6: ; Parent Loop BB14_1 Depth=1
-; GFX942-SDAG-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX942-SDAG-NEXT:    s_nop 0
-; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX942-SDAG-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[14:17], v5, s[4:7], 0 offen offset:32
-; GFX942-SDAG-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB14_6
-; GFX942-SDAG-NEXT:  ; %bb.7: ; in Loop: Header=BB14_1 Depth=1
-; GFX942-SDAG-NEXT:    s_mov_b64 exec, s[8:9]
-; GFX942-SDAG-NEXT:    s_mov_b64 s[8:9], exec
-; GFX942-SDAG-NEXT:  .LBB14_8: ; Parent Loop BB14_1 Depth=1
-; GFX942-SDAG-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX942-SDAG-NEXT:    s_nop 0
-; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX942-SDAG-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[18:21], v5, s[4:7], 0 offen offset:48
-; GFX942-SDAG-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB14_8
-; GFX942-SDAG-NEXT:  ; %bb.9: ; in Loop: Header=BB14_1 Depth=1
-; GFX942-SDAG-NEXT:    s_mov_b64 exec, s[8:9]
-; GFX942-SDAG-NEXT:    s_mov_b64 s[8:9], exec
-; GFX942-SDAG-NEXT:  .LBB14_10: ; Parent Loop BB14_1 Depth=1
-; GFX942-SDAG-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX942-SDAG-NEXT:    s_nop 0
-; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX942-SDAG-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[22:25], v5, s[4:7], 0 offen offset:64
-; GFX942-SDAG-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB14_10
-; GFX942-SDAG-NEXT:  ; %bb.11: ; in Loop: Header=BB14_1 Depth=1
-; GFX942-SDAG-NEXT:    s_mov_b64 exec, s[8:9]
-; GFX942-SDAG-NEXT:    s_mov_b64 s[8:9], exec
-; GFX942-SDAG-NEXT:  .LBB14_12: ; Parent Loop BB14_1 Depth=1
-; GFX942-SDAG-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX942-SDAG-NEXT:    s_nop 0
-; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX942-SDAG-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[26:29], v5, s[4:7], 0 offen offset:80
-; GFX942-SDAG-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB14_12
-; GFX942-SDAG-NEXT:  ; %bb.13: ; in Loop: Header=BB14_1 Depth=1
-; GFX942-SDAG-NEXT:    s_mov_b64 exec, s[8:9]
-; GFX942-SDAG-NEXT:    s_mov_b64 s[8:9], exec
-; GFX942-SDAG-NEXT:  .LBB14_14: ; Parent Loop BB14_1 Depth=1
-; GFX942-SDAG-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX942-SDAG-NEXT:    s_nop 0
-; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX942-SDAG-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[30:33], v5, s[4:7], 0 offen offset:96
-; GFX942-SDAG-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB14_14
-; GFX942-SDAG-NEXT:  ; %bb.15: ; in Loop: Header=BB14_1 Depth=1
-; GFX942-SDAG-NEXT:    s_mov_b64 exec, s[8:9]
-; GFX942-SDAG-NEXT:    s_mov_b64 s[8:9], exec
-; GFX942-SDAG-NEXT:  .LBB14_16: ; Parent Loop BB14_1 Depth=1
-; GFX942-SDAG-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX942-SDAG-NEXT:    s_nop 0
-; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX942-SDAG-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[34:37], v5, s[4:7], 0 offen offset:112
-; GFX942-SDAG-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB14_16
-; GFX942-SDAG-NEXT:  ; %bb.17: ; in Loop: Header=BB14_1 Depth=1
-; GFX942-SDAG-NEXT:    s_mov_b64 exec, s[8:9]
-; GFX942-SDAG-NEXT:    s_mov_b64 s[8:9], exec
-; GFX942-SDAG-NEXT:  .LBB14_18: ; Parent Loop BB14_1 Depth=1
-; GFX942-SDAG-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX942-SDAG-NEXT:    s_nop 0
-; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX942-SDAG-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[6:9], v5, s[4:7], 0 offen offset:128
-; GFX942-SDAG-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB14_18
-; GFX942-SDAG-NEXT:  ; %bb.19: ; in Loop: Header=BB14_1 Depth=1
-; GFX942-SDAG-NEXT:    s_mov_b64 exec, s[8:9]
-; GFX942-SDAG-NEXT:    s_mov_b64 s[8:9], exec
-; GFX942-SDAG-NEXT:  .LBB14_20: ; Parent Loop BB14_1 Depth=1
-; GFX942-SDAG-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX942-SDAG-NEXT:    s_nop 0
-; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX942-SDAG-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[10:13], v5, s[4:7], 0 offen offset:144
-; GFX942-SDAG-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB14_20
-; GFX942-SDAG-NEXT:  ; %bb.21: ; in Loop: Header=BB14_1 Depth=1
-; GFX942-SDAG-NEXT:    s_mov_b64 exec, s[8:9]
-; GFX942-SDAG-NEXT:    s_mov_b64 s[8:9], exec
-; GFX942-SDAG-NEXT:  .LBB14_22: ; Parent Loop BB14_1 Depth=1
-; GFX942-SDAG-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX942-SDAG-NEXT:    s_nop 0
-; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX942-SDAG-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[14:17], v5, s[4:7], 0 offen offset:160
-; GFX942-SDAG-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB14_22
-; GFX942-SDAG-NEXT:  ; %bb.23: ; in Loop: Header=BB14_1 Depth=1
-; GFX942-SDAG-NEXT:    s_mov_b64 exec, s[8:9]
-; GFX942-SDAG-NEXT:    s_mov_b64 s[8:9], exec
-; GFX942-SDAG-NEXT:  .LBB14_24: ; Parent Loop BB14_1 Depth=1
-; GFX942-SDAG-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX942-SDAG-NEXT:    s_nop 0
-; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX942-SDAG-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[18:21], v5, s[4:7], 0 offen offset:176
-; GFX942-SDAG-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB14_24
-; GFX942-SDAG-NEXT:  ; %bb.25: ; in Loop: Header=BB14_1 Depth=1
-; GFX942-SDAG-NEXT:    s_mov_b64 exec, s[8:9]
-; GFX942-SDAG-NEXT:    s_mov_b64 s[8:9], exec
-; GFX942-SDAG-NEXT:  .LBB14_26: ; Parent Loop BB14_1 Depth=1
-; GFX942-SDAG-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX942-SDAG-NEXT:    s_nop 0
-; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX942-SDAG-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[22:25], v5, s[4:7], 0 offen offset:192
-; GFX942-SDAG-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB14_26
-; GFX942-SDAG-NEXT:  ; %bb.27: ; in Loop: Header=BB14_1 Depth=1
-; GFX942-SDAG-NEXT:    s_mov_b64 exec, s[8:9]
-; GFX942-SDAG-NEXT:    s_mov_b64 s[8:9], exec
-; GFX942-SDAG-NEXT:  .LBB14_28: ; Parent Loop BB14_1 Depth=1
-; GFX942-SDAG-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX942-SDAG-NEXT:    s_nop 0
-; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX942-SDAG-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[26:29], v5, s[4:7], 0 offen offset:208
-; GFX942-SDAG-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB14_28
-; GFX942-SDAG-NEXT:  ; %bb.29: ; in Loop: Header=BB14_1 Depth=1
-; GFX942-SDAG-NEXT:    s_mov_b64 exec, s[8:9]
-; GFX942-SDAG-NEXT:    s_mov_b64 s[8:9], exec
-; GFX942-SDAG-NEXT:  .LBB14_30: ; Parent Loop BB14_1 Depth=1
-; GFX942-SDAG-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX942-SDAG-NEXT:    s_nop 0
-; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX942-SDAG-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[30:33], v5, s[4:7], 0 offen offset:224
-; GFX942-SDAG-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB14_30
-; GFX942-SDAG-NEXT:  ; %bb.31: ; in Loop: Header=BB14_1 Depth=1
-; GFX942-SDAG-NEXT:    s_mov_b64 exec, s[8:9]
-; GFX942-SDAG-NEXT:    s_mov_b64 s[8:9], exec
-; GFX942-SDAG-NEXT:  .LBB14_32: ; Parent Loop BB14_1 Depth=1
-; GFX942-SDAG-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX942-SDAG-NEXT:    s_nop 0
-; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX942-SDAG-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[34:37], v5, s[4:7], 0 offen offset:240
-; GFX942-SDAG-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB14_32
-; GFX942-SDAG-NEXT:  ; %bb.33: ; in Loop: Header=BB14_1 Depth=1
-; GFX942-SDAG-NEXT:    s_mov_b64 exec, s[8:9]
-; GFX942-SDAG-NEXT:    s_add_u32 s2, s2, 32
-; GFX942-SDAG-NEXT:    s_addc_u32 s3, s3, 0
-; GFX942-SDAG-NEXT:    s_cselect_b64 s[0:1], -1, 0
-; GFX942-SDAG-NEXT:    v_add_u32_e32 v5, 0x2000, v5
-; GFX942-SDAG-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; GFX942-SDAG-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-SDAG-NEXT:    s_add_u32 s4, s4, 32
+; GFX942-SDAG-NEXT:    s_addc_u32 s5, s5, 0
+; GFX942-SDAG-NEXT:    s_cselect_b64 s[6:7], -1, 0
+; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[2:5], v1, s[0:3], 0 offen
+; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[6:9], v1, s[0:3], 0 offen offset:16
+; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[10:13], v1, s[0:3], 0 offen offset:32
+; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[14:17], v1, s[0:3], 0 offen offset:48
+; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[18:21], v1, s[0:3], 0 offen offset:64
+; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[22:25], v1, s[0:3], 0 offen offset:80
+; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[26:29], v1, s[0:3], 0 offen offset:96
+; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[30:33], v1, s[0:3], 0 offen offset:112
+; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[2:5], v1, s[0:3], 0 offen offset:128
+; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[6:9], v1, s[0:3], 0 offen offset:144
+; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[10:13], v1, s[0:3], 0 offen offset:160
+; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[14:17], v1, s[0:3], 0 offen offset:176
+; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[18:21], v1, s[0:3], 0 offen offset:192
+; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[22:25], v1, s[0:3], 0 offen offset:208
+; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[26:29], v1, s[0:3], 0 offen offset:224
+; GFX942-SDAG-NEXT:    buffer_store_dwordx4 v[30:33], v1, s[0:3], 0 offen offset:240
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v1, 0x2000, v1
+; GFX942-SDAG-NEXT:    s_and_b64 vcc, exec, s[6:7]
 ; GFX942-SDAG-NEXT:    s_cbranch_vccnz .LBB14_1
-; GFX942-SDAG-NEXT:  ; %bb.34: ; %memset.pattern-expansion-residual-body.preheader
-; GFX942-SDAG-NEXT:    s_mov_b64 s[2:3], exec
-; GFX942-SDAG-NEXT:  .LBB14_35: ; =>This Inner Loop Header: Depth=1
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX942-SDAG-NEXT:    s_nop 0
-; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX942-SDAG-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-SDAG-NEXT:    buffer_store_dwordx2 v[6:7], v4, s[4:7], 0 offen offset:256
-; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr6_vgpr7
-; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr4
-; GFX942-SDAG-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB14_35
-; GFX942-SDAG-NEXT:  ; %bb.36:
-; GFX942-SDAG-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX942-SDAG-NEXT:  ; %bb.2: ; %memset.pattern-expansion-residual-body.preheader
+; GFX942-SDAG-NEXT:    buffer_store_dwordx2 v[2:3], v0, s[0:3], 0 offen offset:256
 ; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX942-GISEL-LABEL: memset_pattern_i64_as7_len33_dynval:
 ; GFX942-GISEL:       ; %bb.0:
 ; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-GISEL-NEXT:    v_mov_b32_e32 v8, v5
-; GFX942-GISEL-NEXT:    v_mov_b32_e32 v9, v6
-; GFX942-GISEL-NEXT:    s_mov_b64 s[0:1], 0
-; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[10:11], v[8:9]
-; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
-; GFX942-GISEL-NEXT:    v_mov_b32_e32 v5, v4
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v4, v1
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v5, v2
+; GFX942-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v1, s16, v0
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
 ; GFX942-GISEL-NEXT:  .LBB14_1: ; %memset.pattern-expansion-main-body
-; GFX942-GISEL-NEXT:    ; =>This Loop Header: Depth=1
-; GFX942-GISEL-NEXT:    ; Child Loop BB14_2 Depth 2
-; GFX942-GISEL-NEXT:    ; Child Loop BB14_4 Depth 2
-; GFX942-GISEL-NEXT:    ; Child Loop BB14_6 Depth 2
-; GFX942-GISEL-NEXT:    ; Child Loop BB14_8 Depth 2
-; GFX942-GISEL-NEXT:    ; Child Loop BB14_10 Depth 2
-; GFX942-GISEL-NEXT:    ; Child Loop BB14_12 Depth 2
-; GFX942-GISEL-NEXT:    ; Child Loop BB14_14 Depth 2
-; GFX942-GISEL-NEXT:    ; Child Loop BB14_16 Depth 2
-; GFX942-GISEL-NEXT:    ; Child Loop BB14_18 Depth 2
-; GFX942-GISEL-NEXT:    ; Child Loop BB14_20 Depth 2
-; GFX942-GISEL-NEXT:    ; Child Loop BB14_22 Depth 2
-; GFX942-GISEL-NEXT:    ; Child Loop BB14_24 Depth 2
-; GFX942-GISEL-NEXT:    ; Child Loop BB14_26 Depth 2
-; GFX942-GISEL-NEXT:    ; Child Loop BB14_28 Depth 2
-; GFX942-GISEL-NEXT:    ; Child Loop BB14_30 Depth 2
-; GFX942-GISEL-NEXT:    ; Child Loop BB14_32 Depth 2
-; GFX942-GISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX942-GISEL-NEXT:  .LBB14_2: ; Parent Loop BB14_1 Depth=1
-; GFX942-GISEL-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX942-GISEL-NEXT:    s_nop 0
-; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX942-GISEL-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v5, s[4:7], 0 offen
-; GFX942-GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB14_2
-; GFX942-GISEL-NEXT:  ; %bb.3: ; in Loop: Header=BB14_1 Depth=1
-; GFX942-GISEL-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX942-GISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX942-GISEL-NEXT:  .LBB14_4: ; Parent Loop BB14_1 Depth=1
-; GFX942-GISEL-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX942-GISEL-NEXT:    s_nop 0
-; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX942-GISEL-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v5, s[4:7], 0 offen offset:16
-; GFX942-GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB14_4
-; GFX942-GISEL-NEXT:  ; %bb.5: ; in Loop: Header=BB14_1 Depth=1
-; GFX942-GISEL-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX942-GISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX942-GISEL-NEXT:  .LBB14_6: ; Parent Loop BB14_1 Depth=1
-; GFX942-GISEL-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX942-GISEL-NEXT:    s_nop 0
-; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX942-GISEL-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v5, s[4:7], 0 offen offset:32
-; GFX942-GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB14_6
-; GFX942-GISEL-NEXT:  ; %bb.7: ; in Loop: Header=BB14_1 Depth=1
-; GFX942-GISEL-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX942-GISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX942-GISEL-NEXT:  .LBB14_8: ; Parent Loop BB14_1 Depth=1
-; GFX942-GISEL-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX942-GISEL-NEXT:    s_nop 0
-; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX942-GISEL-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v5, s[4:7], 0 offen offset:48
-; GFX942-GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB14_8
-; GFX942-GISEL-NEXT:  ; %bb.9: ; in Loop: Header=BB14_1 Depth=1
-; GFX942-GISEL-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX942-GISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX942-GISEL-NEXT:  .LBB14_10: ; Parent Loop BB14_1 Depth=1
-; GFX942-GISEL-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX942-GISEL-NEXT:    s_nop 0
-; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX942-GISEL-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v5, s[4:7], 0 offen offset:64
-; GFX942-GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB14_10
-; GFX942-GISEL-NEXT:  ; %bb.11: ; in Loop: Header=BB14_1 Depth=1
-; GFX942-GISEL-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX942-GISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX942-GISEL-NEXT:  .LBB14_12: ; Parent Loop BB14_1 Depth=1
-; GFX942-GISEL-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX942-GISEL-NEXT:    s_nop 0
-; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX942-GISEL-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v5, s[4:7], 0 offen offset:80
-; GFX942-GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB14_12
-; GFX942-GISEL-NEXT:  ; %bb.13: ; in Loop: Header=BB14_1 Depth=1
-; GFX942-GISEL-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX942-GISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX942-GISEL-NEXT:  .LBB14_14: ; Parent Loop BB14_1 Depth=1
-; GFX942-GISEL-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX942-GISEL-NEXT:    s_nop 0
-; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX942-GISEL-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v5, s[4:7], 0 offen offset:96
-; GFX942-GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB14_14
-; GFX942-GISEL-NEXT:  ; %bb.15: ; in Loop: Header=BB14_1 Depth=1
-; GFX942-GISEL-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX942-GISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX942-GISEL-NEXT:  .LBB14_16: ; Parent Loop BB14_1 Depth=1
-; GFX942-GISEL-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX942-GISEL-NEXT:    s_nop 0
-; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX942-GISEL-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v5, s[4:7], 0 offen offset:112
-; GFX942-GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB14_16
-; GFX942-GISEL-NEXT:  ; %bb.17: ; in Loop: Header=BB14_1 Depth=1
-; GFX942-GISEL-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX942-GISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX942-GISEL-NEXT:  .LBB14_18: ; Parent Loop BB14_1 Depth=1
-; GFX942-GISEL-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX942-GISEL-NEXT:    s_nop 0
-; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX942-GISEL-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v5, s[4:7], 0 offen offset:128
-; GFX942-GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB14_18
-; GFX942-GISEL-NEXT:  ; %bb.19: ; in Loop: Header=BB14_1 Depth=1
-; GFX942-GISEL-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX942-GISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX942-GISEL-NEXT:  .LBB14_20: ; Parent Loop BB14_1 Depth=1
-; GFX942-GISEL-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX942-GISEL-NEXT:    s_nop 0
-; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX942-GISEL-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v5, s[4:7], 0 offen offset:144
-; GFX942-GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB14_20
-; GFX942-GISEL-NEXT:  ; %bb.21: ; in Loop: Header=BB14_1 Depth=1
-; GFX942-GISEL-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX942-GISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX942-GISEL-NEXT:  .LBB14_22: ; Parent Loop BB14_1 Depth=1
-; GFX942-GISEL-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX942-GISEL-NEXT:    s_nop 0
-; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX942-GISEL-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v5, s[4:7], 0 offen offset:160
-; GFX942-GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB14_22
-; GFX942-GISEL-NEXT:  ; %bb.23: ; in Loop: Header=BB14_1 Depth=1
-; GFX942-GISEL-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX942-GISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX942-GISEL-NEXT:  .LBB14_24: ; Parent Loop BB14_1 Depth=1
-; GFX942-GISEL-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX942-GISEL-NEXT:    s_nop 0
-; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX942-GISEL-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v5, s[4:7], 0 offen offset:176
-; GFX942-GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB14_24
-; GFX942-GISEL-NEXT:  ; %bb.25: ; in Loop: Header=BB14_1 Depth=1
-; GFX942-GISEL-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX942-GISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX942-GISEL-NEXT:  .LBB14_26: ; Parent Loop BB14_1 Depth=1
-; GFX942-GISEL-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX942-GISEL-NEXT:    s_nop 0
-; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX942-GISEL-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v5, s[4:7], 0 offen offset:192
-; GFX942-GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB14_26
-; GFX942-GISEL-NEXT:  ; %bb.27: ; in Loop: Header=BB14_1 Depth=1
-; GFX942-GISEL-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX942-GISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX942-GISEL-NEXT:  .LBB14_28: ; Parent Loop BB14_1 Depth=1
-; GFX942-GISEL-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX942-GISEL-NEXT:    s_nop 0
-; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX942-GISEL-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v5, s[4:7], 0 offen offset:208
-; GFX942-GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB14_28
-; GFX942-GISEL-NEXT:  ; %bb.29: ; in Loop: Header=BB14_1 Depth=1
-; GFX942-GISEL-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX942-GISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX942-GISEL-NEXT:  .LBB14_30: ; Parent Loop BB14_1 Depth=1
-; GFX942-GISEL-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX942-GISEL-NEXT:    s_nop 0
-; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX942-GISEL-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v5, s[4:7], 0 offen offset:224
-; GFX942-GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB14_30
-; GFX942-GISEL-NEXT:  ; %bb.31: ; in Loop: Header=BB14_1 Depth=1
-; GFX942-GISEL-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX942-GISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX942-GISEL-NEXT:  .LBB14_32: ; Parent Loop BB14_1 Depth=1
-; GFX942-GISEL-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX942-GISEL-NEXT:    s_nop 0
-; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX942-GISEL-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[8:11], v5, s[4:7], 0 offen offset:240
-; GFX942-GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB14_32
-; GFX942-GISEL-NEXT:  ; %bb.33: ; in Loop: Header=BB14_1 Depth=1
-; GFX942-GISEL-NEXT:    s_mov_b64 exec, s[2:3]
-; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v6, vcc, 32, v6
-; GFX942-GISEL-NEXT:    v_add_u32_e32 v5, 0x2000, v5
-; GFX942-GISEL-NEXT:    s_nop 0
-; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
-; GFX942-GISEL-NEXT:    s_xor_b64 s[0:1], vcc, -1
-; GFX942-GISEL-NEXT:    s_xor_b64 s[0:1], s[0:1], -1
-; GFX942-GISEL-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; GFX942-GISEL-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v2, vcc, 32, v2
+; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v1, s[0:3], 0 offen
+; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v1, s[0:3], 0 offen offset:16
+; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v1, s[0:3], 0 offen offset:32
+; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v1, s[0:3], 0 offen offset:48
+; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v1, s[0:3], 0 offen offset:64
+; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v1, s[0:3], 0 offen offset:80
+; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v1, s[0:3], 0 offen offset:96
+; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v1, s[0:3], 0 offen offset:112
+; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v1, s[0:3], 0 offen offset:128
+; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v1, s[0:3], 0 offen offset:144
+; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v1, s[0:3], 0 offen offset:160
+; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v1, s[0:3], 0 offen offset:176
+; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v1, s[0:3], 0 offen offset:192
+; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v1, s[0:3], 0 offen offset:208
+; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v1, s[0:3], 0 offen offset:224
+; GFX942-GISEL-NEXT:    buffer_store_dwordx4 v[4:7], v1, s[0:3], 0 offen offset:240
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX942-GISEL-NEXT:    s_xor_b64 s[4:5], vcc, -1
+; GFX942-GISEL-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v1, 0x2000, v1
+; GFX942-GISEL-NEXT:    s_and_b64 vcc, exec, s[4:5]
 ; GFX942-GISEL-NEXT:    s_cbranch_vccnz .LBB14_1
-; GFX942-GISEL-NEXT:  ; %bb.34: ; %memset.pattern-expansion-residual-body.preheader
-; GFX942-GISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX942-GISEL-NEXT:  .LBB14_35: ; =>This Inner Loop Header: Depth=1
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s4, v0
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s6, v2
-; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX942-GISEL-NEXT:    s_nop 0
-; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX942-GISEL-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
-; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX942-GISEL-NEXT:    buffer_store_dwordx2 v[8:9], v4, s[4:7], 0 offen offset:256
-; GFX942-GISEL-NEXT:    ; implicit-def: $vgpr0
-; GFX942-GISEL-NEXT:    ; implicit-def: $vgpr8_vgpr9
-; GFX942-GISEL-NEXT:    ; implicit-def: $vgpr4
-; GFX942-GISEL-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; GFX942-GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB14_35
-; GFX942-GISEL-NEXT:  ; %bb.36:
-; GFX942-GISEL-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX942-GISEL-NEXT:  ; %bb.2: ; %memset.pattern-expansion-residual-body.preheader
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v0, s16, v0
+; GFX942-GISEL-NEXT:    buffer_store_dwordx2 v[4:5], v0, s[0:3], 0 offen offset:256
 ; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
-  call void @llvm.experimental.memset.pattern(ptr addrspace(7) %a, i64 %val, i64 33, i1 false)
+  %p = getelementptr inbounds i8, ptr addrspace(7) %a, i32 %offset
+  call void @llvm.experimental.memset.pattern(ptr addrspace(7) %p, i64 %val, i64 33, i1 false)
   ret void
 }



More information about the llvm-branch-commits mailing list