[llvm-branch-commits] [llvm] [LowerMemIntrinsics] Optimize memset lowering (PR #169040)
Fabian Ritter via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Fri Nov 21 05:53:03 PST 2025
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/169040
>From 3422b79e4b9b0122927402ae5686b80e04d40ac0 Mon Sep 17 00:00:00 2001
From: Fabian Ritter <fabian.ritter at amd.com>
Date: Wed, 20 Aug 2025 04:20:16 -0400
Subject: [PATCH] [LowerMemIntrinsics] Optimize memset lowering
This patch changes the memset lowering to match the optimized memcpy lowering.
The memset lowering now queries TTI.getMemcpyLoopLoweringType for a preferred
memory access type. If that type is larger than a byte, the memset is lowered
into two loops: a main loop that stores a sufficiently wide vector splat of the
SetValue using the preferred memory access type, and a residual loop that covers
the remaining bytes individually. If the memset size is statically known, the
residual loop is replaced by a sequence of stores.
This improves memset performance on gfx1030 (AMDGPU) in microbenchmarks by
around 7-20x.
I'm planning similar treatment for memset.pattern in a follow-up PR.
For SWDEV-543208.
---
.../Transforms/Utils/LowerMemIntrinsics.h | 3 +-
llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp | 6 +-
.../AMDGPU/AMDGPULowerBufferFatPointers.cpp | 3 +-
.../AMDGPU/AMDGPUTargetTransformInfo.cpp | 3 +-
.../lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp | 2 +-
.../Target/SPIRV/SPIRVPrepareFunctions.cpp | 11 +-
.../Transforms/Utils/LowerMemIntrinsics.cpp | 204 +-
.../CodeGen/AMDGPU/GlobalISel/llvm.memset.ll | 114 +-
.../local-stack-alloc-block-sp-reference.ll | 776 ++++++-
.../lower-buffer-fat-pointers-mem-transfer.ll | 334 ++-
.../AMDGPU/lower-mem-intrinsics-threshold.ll | 51 +-
.../CodeGen/AMDGPU/lower-mem-intrinsics.ll | 68 +-
.../CodeGen/AMDGPU/memintrinsic-unroll.ll | 1616 ++++++++++++++
.../AMDGPU/memset-param-combinations.ll | 1900 +++++++++++++++++
llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll | 8 +-
.../CodeGen/SPIRV/llvm-intrinsics/memset.ll | 4 +-
.../X86/memset-inline-non-constant-len.ll | 24 +-
17 files changed, 4826 insertions(+), 301 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/memset-param-combinations.ll
diff --git a/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h b/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
index d4e72a60fc1ea..8924b8b1e6e54 100644
--- a/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
+++ b/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
@@ -59,7 +59,8 @@ LLVM_ABI bool expandMemMoveAsLoop(MemMoveInst *MemMove,
const TargetTransformInfo &TTI);
/// Expand \p MemSet as a loop. \p MemSet is not deleted.
-LLVM_ABI void expandMemSetAsLoop(MemSetInst *MemSet);
+LLVM_ABI void expandMemSetAsLoop(MemSetInst *MemSet,
+ const TargetTransformInfo &TTI);
/// Expand \p MemSetPattern as a loop. \p MemSet is not deleted.
LLVM_ABI void expandMemSetPatternAsLoop(MemSetPatternInst *MemSet);
diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
index d738dc4eea36d..88e2bb81f9e3b 100644
--- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
+++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
@@ -369,7 +369,7 @@ bool PreISelIntrinsicLowering::expandMemIntrinsicUses(
canEmitLibcall(TM, ParentFunc, RTLIB::MEMSET))
break;
- expandMemSetAsLoop(Memset);
+ expandMemSetAsLoop(Memset, TTI);
Changed = true;
Memset->eraseFromParent();
}
@@ -384,7 +384,9 @@ bool PreISelIntrinsicLowering::expandMemIntrinsicUses(
if (isa<ConstantInt>(Memset->getLength()))
break;
- expandMemSetAsLoop(Memset);
+ Function *ParentFunc = Memset->getFunction();
+ const TargetTransformInfo &TTI = LookupTTI(*ParentFunc);
+ expandMemSetAsLoop(Memset, TTI);
Changed = true;
Memset->eraseFromParent();
break;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index fdff21b6ef8df..76f1e006bbf74 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -635,7 +635,8 @@ bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitMemSetInst(
MemSetInst &MSI) {
if (MSI.getDestAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER)
return false;
- llvm::expandMemSetAsLoop(&MSI);
+ llvm::expandMemSetAsLoop(&MSI,
+ TM->getTargetTransformInfo(*MSI.getFunction()));
MSI.eraseFromParent();
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 03d16fdd54c42..5a68dca1b10b7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -80,7 +80,8 @@ static cl::opt<size_t> InlineMaxBB(
static cl::opt<unsigned> MemcpyLoopUnroll(
"amdgpu-memcpy-loop-unroll",
cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory "
- "operations when lowering memcpy as a loop"),
+ "operations when lowering statically-sized memcpy, memmove, or"
+ "memset as a loop"),
cl::init(16), cl::Hidden);
static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
index ac6f4061b9f1f..d0b50d2610bd5 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
@@ -128,7 +128,7 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) {
} else if (MemMoveInst *Memmove = dyn_cast<MemMoveInst>(MemCall)) {
expandMemMoveAsLoop(Memmove, TTI);
} else if (MemSetInst *Memset = dyn_cast<MemSetInst>(MemCall)) {
- expandMemSetAsLoop(Memset);
+ expandMemSetAsLoop(Memset, TTI);
}
MemCall->eraseFromParent();
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
index be88f334d2171..8bee6da75cc75 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
@@ -23,6 +23,7 @@
#include "SPIRVTargetMachine.h"
#include "SPIRVUtils.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/IR/IRBuilder.h"
@@ -93,7 +94,8 @@ static Function *getOrCreateFunction(Module *M, Type *RetTy,
return NewF;
}
-static bool lowerIntrinsicToFunction(IntrinsicInst *Intrinsic) {
+static bool lowerIntrinsicToFunction(IntrinsicInst *Intrinsic,
+ const TargetTransformInfo &TTI) {
// For @llvm.memset.* intrinsic cases with constant value and length arguments
// are emulated via "storing" a constant array to the destination. For other
// cases we wrap the intrinsic in @spirv.llvm_memset_* function and expand the
@@ -137,7 +139,7 @@ static bool lowerIntrinsicToFunction(IntrinsicInst *Intrinsic) {
auto *MemSet = IRB.CreateMemSet(Dest, Val, Len, MSI->getDestAlign(),
MSI->isVolatile());
IRB.CreateRetVoid();
- expandMemSetAsLoop(cast<MemSetInst>(MemSet));
+ expandMemSetAsLoop(cast<MemSetInst>(MemSet), TTI);
MemSet->eraseFromParent();
break;
}
@@ -399,6 +401,7 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) {
bool Changed = false;
const SPIRVSubtarget &STI = TM.getSubtarget<SPIRVSubtarget>(*F);
SmallVector<Instruction *> EraseFromParent;
+ const TargetTransformInfo &TTI = TM.getTargetTransformInfo(*F);
for (BasicBlock &BB : *F) {
for (Instruction &I : make_early_inc_range(BB)) {
auto Call = dyn_cast<CallInst>(&I);
@@ -411,7 +414,7 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) {
switch (II->getIntrinsicID()) {
case Intrinsic::memset:
case Intrinsic::bswap:
- Changed |= lowerIntrinsicToFunction(II);
+ Changed |= lowerIntrinsicToFunction(II, TTI);
break;
case Intrinsic::fshl:
case Intrinsic::fshr:
@@ -459,7 +462,7 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) {
return false;
return II->getCalledFunction()->getName().starts_with(Prefix);
}))
- Changed |= lowerIntrinsicToFunction(II);
+ Changed |= lowerIntrinsicToFunction(II, TTI);
break;
}
}
diff --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
index 88e5d038bff82..07bfceb99d206 100644
--- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -930,9 +930,187 @@ static void createMemMoveLoopKnownSize(Instruction *InsertBefore,
}
}
+/// Create a Value of \p DstType that consists of a sequence of copies of
+/// \p SetValue, using bitcasts and a vector splat.
+static Value *createMemSetSplat(const DataLayout &DL, IRBuilderBase &B,
+ Value *SetValue, Type *DstType) {
+ unsigned DstSize = DL.getTypeStoreSize(DstType);
+ Type *SetValueType = SetValue->getType();
+ unsigned SetValueSize = DL.getTypeStoreSize(SetValueType);
+ assert(SetValueSize == DL.getTypeAllocSize(SetValueType) &&
+ "Store size and alloc size of SetValue's type must match");
+ assert(SetValueSize != 0 && DstSize % SetValueSize == 0 &&
+ "DstType size must be a multiple of SetValue size");
+
+ Value *Result = SetValue;
+ if (DstSize != SetValueSize) {
+ if (!SetValueType->isIntegerTy() && !SetValueType->isFloatingPointTy()) {
+ // If the type cannot be put into a vector, bitcast to iN first.
+ LLVMContext &Ctx = SetValue->getContext();
+ Result = B.CreateBitCast(Result, Type::getIntNTy(Ctx, SetValueSize * 8),
+ "setvalue.toint");
+ }
+ // Form a sufficiently large vector consisting of SetValue, repeated.
+ Result =
+ B.CreateVectorSplat(DstSize / SetValueSize, Result, "setvalue.splat");
+ }
+
+ // The value has the right size, but we might have to bitcast it to the right
+ // type.
+ if (Result->getType() != DstType) {
+ Result = B.CreateBitCast(Result, DstType, "setvalue.splat.cast");
+ }
+ return Result;
+}
+
+static void createMemSetLoopKnownSize(Instruction *InsertBefore, Value *DstAddr,
+ ConstantInt *Len, Value *SetValue,
+ Align DstAlign, bool IsVolatile,
+ const TargetTransformInfo &TTI) {
+ // No need to expand zero length memsets.
+ if (Len->isZero())
+ return;
+
+ BasicBlock *PreLoopBB = InsertBefore->getParent();
+ Function *ParentFunc = PreLoopBB->getParent();
+ const DataLayout &DL = ParentFunc->getDataLayout();
+ LLVMContext &Ctx = PreLoopBB->getContext();
+
+ unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
+
+ Type *TypeOfLen = Len->getType();
+ Type *Int8Type = Type::getInt8Ty(Ctx);
+ assert(SetValue->getType() == Int8Type && "Can only set bytes");
+
+ // Use the same memory access type as for a memcpy with the same Dst and Src
+ // alignment and address space.
+ Type *LoopOpType = TTI.getMemcpyLoopLoweringType(
+ Ctx, Len, DstAS, DstAS, DstAlign, DstAlign, std::nullopt);
+ unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
+
+ uint64_t LoopEndCount = alignDown(Len->getZExtValue(), LoopOpSize);
+
+ if (LoopEndCount != 0) {
+ Value *SplatSetValue = nullptr;
+ {
+ IRBuilder<> PreLoopBuilder(InsertBefore);
+ SplatSetValue =
+ createMemSetSplat(DL, PreLoopBuilder, SetValue, LoopOpType);
+ }
+
+  // Don't generate a residual loop; the remaining bytes are set with
+ // straight-line code.
+ LoopExpansionInfo LEI =
+ insertLoopExpansion(InsertBefore, Len, LoopOpSize, 0, "static-memset");
+
+ // Fill MainLoopBB
+ IRBuilder<> MainLoopBuilder(LEI.MainLoopIP);
+ Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));
+
+ Value *DstGEP =
+ MainLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, LEI.MainLoopIndex);
+
+ MainLoopBuilder.CreateAlignedStore(SplatSetValue, DstGEP, PartDstAlign,
+ IsVolatile);
+
+ assert(!LEI.ResidualLoopIP && !LEI.ResidualLoopIndex &&
+ "No residual loop was requested");
+ }
+
+ uint64_t BytesSet = LoopEndCount;
+ uint64_t RemainingBytes = Len->getZExtValue() - BytesSet;
+ if (RemainingBytes == 0)
+ return;
+
+ IRBuilder<> RBuilder(InsertBefore);
+
+ SmallVector<Type *, 5> RemainingOps;
+ TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
+ DstAS, DstAS, DstAlign, DstAlign,
+ std::nullopt);
+
+ Type *PreviousOpTy = nullptr;
+ Value *SplatSetValue = nullptr;
+ for (auto *OpTy : RemainingOps) {
+ unsigned OperandSize = DL.getTypeStoreSize(OpTy);
+ Align PartDstAlign(commonAlignment(DstAlign, BytesSet));
+
+ // Avoid recomputing the splat SetValue if it's the same as for the last
+ // iteration.
+ if (OpTy != PreviousOpTy)
+ SplatSetValue = createMemSetSplat(DL, RBuilder, SetValue, OpTy);
+
+ Value *DstGEP = RBuilder.CreateInBoundsGEP(
+ Int8Type, DstAddr, ConstantInt::get(TypeOfLen, BytesSet));
+ RBuilder.CreateAlignedStore(SplatSetValue, DstGEP, PartDstAlign,
+ IsVolatile);
+ BytesSet += OperandSize;
+ PreviousOpTy = OpTy;
+ }
+ assert(BytesSet == Len->getZExtValue() &&
+ "Bytes set should match size in the call!");
+}
+
+static void createMemSetLoopUnknownSize(Instruction *InsertBefore,
+ Value *DstAddr, Value *Len,
+ Value *SetValue, Align DstAlign,
+ bool IsVolatile,
+ const TargetTransformInfo &TTI) {
+ BasicBlock *PreLoopBB = InsertBefore->getParent();
+ Function *ParentFunc = PreLoopBB->getParent();
+ const DataLayout &DL = ParentFunc->getDataLayout();
+ LLVMContext &Ctx = PreLoopBB->getContext();
+
+ unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
+
+ Type *Int8Type = Type::getInt8Ty(Ctx);
+ assert(SetValue->getType() == Int8Type && "Can only set bytes");
+
+ Type *LoopOpType = TTI.getMemcpyLoopLoweringType(
+ Ctx, Len, DstAS, DstAS, DstAlign, DstAlign, std::nullopt);
+ unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
+
+ Type *ResidualLoopOpType = Int8Type;
+ unsigned ResidualLoopOpSize = DL.getTypeStoreSize(ResidualLoopOpType);
+
+ Value *SplatSetValue = SetValue;
+ {
+ IRBuilder<> PreLoopBuilder(InsertBefore);
+ SplatSetValue = createMemSetSplat(DL, PreLoopBuilder, SetValue, LoopOpType);
+ }
+
+ LoopExpansionInfo LEI = insertLoopExpansion(
+ InsertBefore, Len, LoopOpSize, ResidualLoopOpSize, "dynamic-memset");
+
+ // Fill MainLoopBB
+ IRBuilder<> MainLoopBuilder(LEI.MainLoopIP);
+ Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));
+
+ Value *DstGEP =
+ MainLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, LEI.MainLoopIndex);
+ MainLoopBuilder.CreateAlignedStore(SplatSetValue, DstGEP, PartDstAlign,
+ IsVolatile);
+
+ // Fill ResidualLoopBB
+ if (!LEI.ResidualLoopIP)
+ return;
+
+ Align ResDstAlign(commonAlignment(PartDstAlign, ResidualLoopOpSize));
+
+ IRBuilder<> ResLoopBuilder(LEI.ResidualLoopIP);
+
+ Value *ResDstGEP = ResLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr,
+ LEI.ResidualLoopIndex);
+ ResLoopBuilder.CreateAlignedStore(SetValue, ResDstGEP, ResDstAlign,
+ IsVolatile);
+}
+
static void createMemSetLoop(Instruction *InsertBefore, Value *DstAddr,
Value *CopyLen, Value *SetValue, Align DstAlign,
bool IsVolatile) {
+  // No longer used for memset; currently only used for memset.pattern.
+ // TODO: Update the memset.pattern lowering to also use the loop expansion
+ // framework and remove this function.
Type *TypeOfCopyLen = CopyLen->getType();
BasicBlock *OrigBB = InsertBefore->getParent();
Function *F = OrigBB->getParent();
@@ -1067,13 +1245,25 @@ bool llvm::expandMemMoveAsLoop(MemMoveInst *Memmove,
return true;
}
-void llvm::expandMemSetAsLoop(MemSetInst *Memset) {
- createMemSetLoop(/* InsertBefore */ Memset,
- /* DstAddr */ Memset->getRawDest(),
- /* CopyLen */ Memset->getLength(),
- /* SetValue */ Memset->getValue(),
- /* Alignment */ Memset->getDestAlign().valueOrOne(),
- Memset->isVolatile());
+void llvm::expandMemSetAsLoop(MemSetInst *Memset,
+ const TargetTransformInfo &TTI) {
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Memset->getLength())) {
+ createMemSetLoopKnownSize(
+ /* InsertBefore */ Memset,
+ /* DstAddr */ Memset->getRawDest(),
+ /* Len */ CI,
+ /* SetValue */ Memset->getValue(),
+ /* DstAlign */ Memset->getDestAlign().valueOrOne(),
+ Memset->isVolatile(), TTI);
+ } else {
+ createMemSetLoopUnknownSize(
+ /* InsertBefore */ Memset,
+ /* DstAddr */ Memset->getRawDest(),
+ /* Len */ Memset->getLength(),
+ /* SetValue */ Memset->getValue(),
+ /* DstAlign */ Memset->getDestAlign().valueOrOne(),
+ Memset->isVolatile(), TTI);
+ }
}
void llvm::expandMemSetPatternAsLoop(MemSetPatternInst *Memset) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll
index 04652af147f9b..4d35f3198bc0a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll
@@ -1,27 +1,87 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-- -mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=LOOP %s
-; RUN: llc -global-isel -mtriple=amdgcn-- -mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=UNROLL %s
+; RUN: llc -global-isel -mtriple=amdgcn-- -amdgpu-memcpy-loop-unroll=2 -mem-intrinsic-expand-size=35 %s -o - | FileCheck -check-prefix=LOOP %s
+; RUN: llc -global-isel -mtriple=amdgcn-- -amdgpu-memcpy-loop-unroll=2 -mem-intrinsic-expand-size=37 %s -o - | FileCheck -check-prefix=UNROLL %s
declare void @llvm.memset.p1.i32(ptr addrspace(1), i8, i32, i1)
define amdgpu_cs void @memset_p1i8(ptr addrspace(1) %dst, i8 %val) {
; LOOP-LABEL: memset_p1i8:
-; LOOP: ; %bb.0: ; %loadstoreloop.preheader
+; LOOP: ; %bb.0:
+; LOOP-NEXT: v_and_b32_e32 v3, 0xff, v2
; LOOP-NEXT: s_mov_b64 s[0:1], 0
; LOOP-NEXT: s_mov_b32 s2, 0
; LOOP-NEXT: s_mov_b32 s3, 0xf000
+; LOOP-NEXT: v_lshlrev_b32_e32 v4, 8, v3
+; LOOP-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; LOOP-NEXT: v_lshlrev_b32_e32 v6, 24, v3
+; LOOP-NEXT: v_or_b32_e32 v3, v3, v4
+; LOOP-NEXT: v_or_b32_e32 v3, v3, v5
+; LOOP-NEXT: v_or_b32_e32 v5, v3, v6
+; LOOP-NEXT: v_lshrrev_b32_e32 v6, 16, v5
+; LOOP-NEXT: v_bfe_u32 v7, v5, 8, 8
+; LOOP-NEXT: v_lshrrev_b32_e32 v8, 24, v5
; LOOP-NEXT: v_mov_b32_e32 v4, s1
; LOOP-NEXT: v_mov_b32_e32 v3, s0
-; LOOP-NEXT: .LBB0_1: ; %loadstoreloop
+; LOOP-NEXT: .LBB0_1: ; %static-memset-expansion-main-body
; LOOP-NEXT: ; =>This Inner Loop Header: Depth=1
-; LOOP-NEXT: v_add_i32_e32 v5, vcc, v0, v3
-; LOOP-NEXT: v_addc_u32_e32 v6, vcc, v1, v4, vcc
-; LOOP-NEXT: v_add_i32_e32 v3, vcc, 1, v3
+; LOOP-NEXT: v_add_i32_e32 v9, vcc, v0, v3
+; LOOP-NEXT: v_addc_u32_e32 v10, vcc, v1, v4, vcc
+; LOOP-NEXT: v_add_i32_e32 v3, vcc, 32, v3
; LOOP-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
-; LOOP-NEXT: v_cmp_gt_u32_e32 vcc, 4, v3
-; LOOP-NEXT: buffer_store_byte v2, v[5:6], s[0:3], 0 addr64
+; LOOP-NEXT: buffer_store_byte v5, v[9:10], s[0:3], 0 addr64
+; LOOP-NEXT: buffer_store_byte v7, v[9:10], s[0:3], 0 addr64 offset:1
+; LOOP-NEXT: buffer_store_byte v6, v[9:10], s[0:3], 0 addr64 offset:2
+; LOOP-NEXT: buffer_store_byte v8, v[9:10], s[0:3], 0 addr64 offset:3
+; LOOP-NEXT: buffer_store_byte v5, v[9:10], s[0:3], 0 addr64 offset:4
+; LOOP-NEXT: buffer_store_byte v7, v[9:10], s[0:3], 0 addr64 offset:5
+; LOOP-NEXT: buffer_store_byte v6, v[9:10], s[0:3], 0 addr64 offset:6
+; LOOP-NEXT: buffer_store_byte v8, v[9:10], s[0:3], 0 addr64 offset:7
+; LOOP-NEXT: buffer_store_byte v5, v[9:10], s[0:3], 0 addr64 offset:8
+; LOOP-NEXT: buffer_store_byte v7, v[9:10], s[0:3], 0 addr64 offset:9
+; LOOP-NEXT: buffer_store_byte v6, v[9:10], s[0:3], 0 addr64 offset:10
+; LOOP-NEXT: buffer_store_byte v8, v[9:10], s[0:3], 0 addr64 offset:11
+; LOOP-NEXT: buffer_store_byte v5, v[9:10], s[0:3], 0 addr64 offset:12
+; LOOP-NEXT: buffer_store_byte v7, v[9:10], s[0:3], 0 addr64 offset:13
+; LOOP-NEXT: buffer_store_byte v6, v[9:10], s[0:3], 0 addr64 offset:14
+; LOOP-NEXT: buffer_store_byte v8, v[9:10], s[0:3], 0 addr64 offset:15
+; LOOP-NEXT: buffer_store_byte v5, v[9:10], s[0:3], 0 addr64 offset:16
+; LOOP-NEXT: buffer_store_byte v7, v[9:10], s[0:3], 0 addr64 offset:17
+; LOOP-NEXT: buffer_store_byte v6, v[9:10], s[0:3], 0 addr64 offset:18
+; LOOP-NEXT: buffer_store_byte v8, v[9:10], s[0:3], 0 addr64 offset:19
+; LOOP-NEXT: buffer_store_byte v5, v[9:10], s[0:3], 0 addr64 offset:20
+; LOOP-NEXT: buffer_store_byte v7, v[9:10], s[0:3], 0 addr64 offset:21
+; LOOP-NEXT: buffer_store_byte v6, v[9:10], s[0:3], 0 addr64 offset:22
+; LOOP-NEXT: buffer_store_byte v8, v[9:10], s[0:3], 0 addr64 offset:23
+; LOOP-NEXT: buffer_store_byte v5, v[9:10], s[0:3], 0 addr64 offset:24
+; LOOP-NEXT: buffer_store_byte v7, v[9:10], s[0:3], 0 addr64 offset:25
+; LOOP-NEXT: buffer_store_byte v6, v[9:10], s[0:3], 0 addr64 offset:26
+; LOOP-NEXT: buffer_store_byte v8, v[9:10], s[0:3], 0 addr64 offset:27
+; LOOP-NEXT: buffer_store_byte v5, v[9:10], s[0:3], 0 addr64 offset:28
+; LOOP-NEXT: buffer_store_byte v7, v[9:10], s[0:3], 0 addr64 offset:29
+; LOOP-NEXT: buffer_store_byte v6, v[9:10], s[0:3], 0 addr64 offset:30
+; LOOP-NEXT: v_cmp_gt_u32_e32 vcc, 32, v3
+; LOOP-NEXT: buffer_store_byte v8, v[9:10], s[0:3], 0 addr64 offset:31
; LOOP-NEXT: s_cbranch_vccnz .LBB0_1
-; LOOP-NEXT: ; %bb.2: ; %split
+; LOOP-NEXT: ; %bb.2: ; %static-memset-post-expansion
+; LOOP-NEXT: v_and_b32_e32 v2, 0xff, v2
+; LOOP-NEXT: s_mov_b32 s2, 0
+; LOOP-NEXT: s_mov_b32 s3, 0xf000
+; LOOP-NEXT: s_mov_b64 s[0:1], 0
+; LOOP-NEXT: v_lshlrev_b32_e32 v3, 8, v2
+; LOOP-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; LOOP-NEXT: s_waitcnt expcnt(3)
+; LOOP-NEXT: v_lshlrev_b32_e32 v5, 24, v2
+; LOOP-NEXT: v_or_b32_e32 v2, v2, v3
+; LOOP-NEXT: v_or_b32_e32 v2, v2, v4
+; LOOP-NEXT: v_or_b32_e32 v2, v2, v5
+; LOOP-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; LOOP-NEXT: v_bfe_u32 v4, v2, 8, 8
+; LOOP-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:32
+; LOOP-NEXT: s_waitcnt expcnt(0)
+; LOOP-NEXT: v_lshrrev_b32_e32 v2, 24, v2
+; LOOP-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:33
+; LOOP-NEXT: buffer_store_byte v3, v[0:1], s[0:3], 0 addr64 offset:34
+; LOOP-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:35
; LOOP-NEXT: s_endpgm
;
; UNROLL-LABEL: memset_p1i8:
@@ -33,7 +93,39 @@ define amdgpu_cs void @memset_p1i8(ptr addrspace(1) %dst, i8 %val) {
; UNROLL-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:1
; UNROLL-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:2
; UNROLL-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:3
+; UNROLL-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:4
+; UNROLL-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:5
+; UNROLL-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:6
+; UNROLL-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:7
+; UNROLL-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:8
+; UNROLL-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:9
+; UNROLL-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:10
+; UNROLL-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:11
+; UNROLL-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:12
+; UNROLL-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:13
+; UNROLL-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:14
+; UNROLL-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:15
+; UNROLL-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:16
+; UNROLL-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:17
+; UNROLL-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:18
+; UNROLL-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:19
+; UNROLL-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:20
+; UNROLL-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:21
+; UNROLL-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:22
+; UNROLL-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:23
+; UNROLL-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:24
+; UNROLL-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:25
+; UNROLL-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:26
+; UNROLL-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:27
+; UNROLL-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:28
+; UNROLL-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:29
+; UNROLL-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:30
+; UNROLL-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:31
+; UNROLL-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:32
+; UNROLL-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:33
+; UNROLL-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:34
+; UNROLL-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:35
; UNROLL-NEXT: s_endpgm
- call void @llvm.memset.p1.i32(ptr addrspace(1) %dst, i8 %val, i32 4, i1 false)
+ call void @llvm.memset.p1.i32(ptr addrspace(1) %dst, i8 %val, i32 36, i1 false)
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
index 5f0ca7bc42ae0..097d36b190299 100644
--- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
@@ -28,31 +28,179 @@ define amdgpu_kernel void @local_stack_offset_uses_sp(ptr addrspace(1) %out) {
; MUBUF-NEXT: s_mov_b32 s4, 0
; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: .LBB0_1: ; %loadstoreloop
+; MUBUF-NEXT: .LBB0_1: ; %static-memset-expansion-main-body
; MUBUF-NEXT: ; =>This Inner Loop Header: Depth=1
; MUBUF-NEXT: v_mov_b32_e32 v3, 0x3000
; MUBUF-NEXT: v_add_u32_e32 v2, s4, v3
-; MUBUF-NEXT: s_add_i32 s4, s4, 1
-; MUBUF-NEXT: s_cmpk_lt_u32 s4, 0x2120
-; MUBUF-NEXT: buffer_store_byte v1, v2, s[0:3], 0 offen
+; MUBUF-NEXT: s_addk_i32 s4, 0x100
+; MUBUF-NEXT: s_cmpk_lt_u32 s4, 0x2100
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:252
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:248
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:244
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:240
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:236
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:232
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:228
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:224
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:220
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:216
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:212
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:208
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:204
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:200
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:196
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:192
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:188
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:184
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:180
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:176
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:172
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:168
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:164
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:160
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:156
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:152
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:148
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:144
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:140
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:136
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:132
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:128
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:124
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:120
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:116
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:112
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:108
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:104
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:100
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:96
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:92
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:88
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:84
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:80
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:76
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:72
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:68
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:64
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:60
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:56
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:52
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:48
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:44
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:40
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:36
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:32
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:28
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:24
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:20
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:12
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:8
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_cbranch_scc1 .LBB0_1
-; MUBUF-NEXT: ; %bb.2: ; %split
-; MUBUF-NEXT: v_mov_b32_e32 v1, 0x50d0
-; MUBUF-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen glc
+; MUBUF-NEXT: ; %bb.2: ; %static-memset-post-expansion
+; MUBUF-NEXT: v_mov_b32_e32 v1, 0x5100
+; MUBUF-NEXT: v_mov_b32_e32 v2, 0
+; MUBUF-NEXT: s_movk_i32 s4, 0x2110
+; MUBUF-NEXT: v_mov_b32_e32 v3, 0x3000
+; MUBUF-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen offset:12
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen offset:8
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen offset:4
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: v_add_u32_e32 v1, s4, v3
+; MUBUF-NEXT: s_movk_i32 s4, 0x20d0
+; MUBUF-NEXT: v_mov_b32_e32 v3, 0x3000
+; MUBUF-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen offset:12
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen offset:8
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen offset:4
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: v_add_u32_e32 v1, s4, v3
+; MUBUF-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 glc
+; MUBUF-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:4 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen glc
+; MUBUF-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:4 glc
+; MUBUF-NEXT: buffer_load_dword v6, v0, s[0:3], 0 offen offset:4 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; MUBUF-NEXT: v_mov_b32_e32 v6, 0
-; MUBUF-NEXT: v_add_co_u32_e32 v0, vcc, v2, v4
-; MUBUF-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v5, vcc
+; MUBUF-NEXT: v_add_co_u32_e32 v0, vcc, v3, v5
+; MUBUF-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v6, vcc
; MUBUF-NEXT: s_waitcnt lgkmcnt(0)
-; MUBUF-NEXT: global_store_dwordx2 v6, v[0:1], s[4:5]
+; MUBUF-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_endpgm
;
@@ -65,20 +213,69 @@ define amdgpu_kernel void @local_stack_offset_uses_sp(ptr addrspace(1) %out) {
; FLATSCR-NEXT: scratch_store_dword off, v0, s0
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_mov_b32 s0, 0
-; FLATSCR-NEXT: .LBB0_1: ; %loadstoreloop
+; FLATSCR-NEXT: s_mov_b32 s1, s0
+; FLATSCR-NEXT: s_mov_b32 s2, s0
+; FLATSCR-NEXT: s_mov_b32 s3, s0
+; FLATSCR-NEXT: v_mov_b32_e32 v0, s0
+; FLATSCR-NEXT: v_mov_b32_e32 v1, s1
+; FLATSCR-NEXT: v_mov_b32_e32 v2, s2
+; FLATSCR-NEXT: v_mov_b32_e32 v3, s3
+; FLATSCR-NEXT: .LBB0_1: ; %static-memset-expansion-main-body
; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1
; FLATSCR-NEXT: s_add_i32 s1, s0, 0x3000
-; FLATSCR-NEXT: s_add_i32 s0, s0, 1
-; FLATSCR-NEXT: s_cmpk_lt_u32 s0, 0x2120
-; FLATSCR-NEXT: scratch_store_byte off, v0, s1
+; FLATSCR-NEXT: s_addk_i32 s0, 0x100
+; FLATSCR-NEXT: s_cmpk_lt_u32 s0, 0x2100
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:240
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:224
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:208
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:192
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:176
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:160
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:144
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:128
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:112
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:96
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:80
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:64
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:48
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:32
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:16
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_cbranch_scc1 .LBB0_1
-; FLATSCR-NEXT: ; %bb.2: ; %split
+; FLATSCR-NEXT: ; %bb.2: ; %static-memset-post-expansion
; FLATSCR-NEXT: s_movk_i32 s0, 0x2000
-; FLATSCR-NEXT: s_addk_i32 s0, 0x3000
-; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s0 offset:208 glc
+; FLATSCR-NEXT: s_add_i32 s6, s0, 0x3000
+; FLATSCR-NEXT: s_mov_b32 s0, 0
+; FLATSCR-NEXT: s_mov_b32 s1, s0
+; FLATSCR-NEXT: s_mov_b32 s2, s0
+; FLATSCR-NEXT: s_mov_b32 s3, s0
+; FLATSCR-NEXT: v_mov_b32_e32 v0, s0
+; FLATSCR-NEXT: v_mov_b32_e32 v1, s1
+; FLATSCR-NEXT: v_mov_b32_e32 v2, s2
+; FLATSCR-NEXT: v_mov_b32_e32 v3, s3
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s6 offset:256
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s6 offset:272
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_movk_i32 s0, 0x3000
+; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s6 offset:208 glc
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s0 offset:64 glc
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
@@ -121,20 +318,173 @@ define void @func_local_stack_offset_uses_sp(ptr addrspace(1) %out) {
; MUBUF-NEXT: s_add_i32 s32, s32, 0x200000
; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], s33 offen
; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: .LBB1_1: ; %loadstoreloop
+; MUBUF-NEXT: .LBB1_1: ; %static-memset-expansion-main-body
; MUBUF-NEXT: ; =>This Inner Loop Header: Depth=1
; MUBUF-NEXT: v_lshrrev_b32_e64 v5, 6, s33
; MUBUF-NEXT: v_add_u32_e32 v4, s4, v5
; MUBUF-NEXT: v_mov_b32_e32 v5, 0x3000
-; MUBUF-NEXT: s_add_i32 s4, s4, 1
+; MUBUF-NEXT: s_addk_i32 s4, 0x100
; MUBUF-NEXT: v_add_u32_e32 v4, v5, v4
-; MUBUF-NEXT: s_cmpk_lt_u32 s4, 0x2120
-; MUBUF-NEXT: buffer_store_byte v3, v4, s[0:3], 0 offen
+; MUBUF-NEXT: s_cmpk_lt_u32 s4, 0x2100
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:252
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:248
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:244
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:240
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:236
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:232
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:228
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:224
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:220
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:216
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:212
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:208
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:204
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:200
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:196
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:192
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:188
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:184
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:180
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:176
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:172
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:168
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:164
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:160
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:156
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:152
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:148
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:144
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:140
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:136
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:132
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:128
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:124
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:120
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:116
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:112
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:108
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:104
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:100
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:96
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:92
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:88
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:84
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:80
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:76
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:72
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:68
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:64
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:60
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:56
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:52
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:48
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:44
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:40
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:36
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:32
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:28
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:24
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:20
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:16
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:12
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:8
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_cbranch_scc1 .LBB1_1
-; MUBUF-NEXT: ; %bb.2: ; %split
+; MUBUF-NEXT: ; %bb.2: ; %static-memset-post-expansion
; MUBUF-NEXT: v_lshrrev_b32_e64 v4, 6, s33
-; MUBUF-NEXT: v_add_u32_e32 v3, 0x50d0, v4
+; MUBUF-NEXT: v_add_u32_e32 v3, 0x5100, v4
+; MUBUF-NEXT: v_mov_b32_e32 v4, 0
+; MUBUF-NEXT: s_movk_i32 s4, 0x2110
+; MUBUF-NEXT: v_lshrrev_b32_e64 v5, 6, s33
+; MUBUF-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen offset:12
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen offset:8
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen offset:4
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: v_add_u32_e32 v3, s4, v5
+; MUBUF-NEXT: v_mov_b32_e32 v5, 0x3000
+; MUBUF-NEXT: v_add_u32_e32 v3, v5, v3
+; MUBUF-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen offset:12
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen offset:8
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen offset:4
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: s_movk_i32 s4, 0x20d0
+; MUBUF-NEXT: v_lshrrev_b32_e64 v4, 6, s33
+; MUBUF-NEXT: v_add_u32_e32 v3, s4, v4
+; MUBUF-NEXT: v_mov_b32_e32 v4, 0x3000
+; MUBUF-NEXT: v_add_u32_e32 v3, v4, v3
; MUBUF-NEXT: buffer_load_dword v4, v3, s[0:3], 0 offen glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: buffer_load_dword v5, v3, s[0:3], 0 offen offset:4 glc
@@ -155,10 +505,10 @@ define void @func_local_stack_offset_uses_sp(ptr addrspace(1) %out) {
; FLATSCR-LABEL: func_local_stack_offset_uses_sp:
; FLATSCR: ; %bb.0: ; %entry
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; FLATSCR-NEXT: s_mov_b32 s2, s33
+; FLATSCR-NEXT: s_mov_b32 s5, s33
; FLATSCR-NEXT: s_add_i32 s33, s32, 0x1fff
; FLATSCR-NEXT: s_and_b32 s33, s33, 0xffffe000
-; FLATSCR-NEXT: s_mov_b32 s3, s34
+; FLATSCR-NEXT: s_mov_b32 s6, s34
; FLATSCR-NEXT: s_mov_b32 s34, s32
; FLATSCR-NEXT: s_add_i32 s32, s32, 0x8000
; FLATSCR-NEXT: v_mov_b32_e32 v2, 0
@@ -166,27 +516,76 @@ define void @func_local_stack_offset_uses_sp(ptr addrspace(1) %out) {
; FLATSCR-NEXT: scratch_store_dword off, v2, s0
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_mov_b32 s0, 0
-; FLATSCR-NEXT: .LBB1_1: ; %loadstoreloop
+; FLATSCR-NEXT: s_mov_b32 s3, s0
+; FLATSCR-NEXT: s_mov_b32 s1, s0
+; FLATSCR-NEXT: s_mov_b32 s2, s0
+; FLATSCR-NEXT: v_mov_b32_e32 v5, s3
+; FLATSCR-NEXT: v_mov_b32_e32 v4, s2
+; FLATSCR-NEXT: v_mov_b32_e32 v3, s1
+; FLATSCR-NEXT: v_mov_b32_e32 v2, s0
+; FLATSCR-NEXT: .LBB1_1: ; %static-memset-expansion-main-body
; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1
; FLATSCR-NEXT: s_add_i32 s1, s33, s0
; FLATSCR-NEXT: s_addk_i32 s1, 0x3000
-; FLATSCR-NEXT: s_add_i32 s0, s0, 1
-; FLATSCR-NEXT: s_cmpk_lt_u32 s0, 0x2120
-; FLATSCR-NEXT: scratch_store_byte off, v2, s1
+; FLATSCR-NEXT: s_addk_i32 s0, 0x100
+; FLATSCR-NEXT: s_cmpk_lt_u32 s0, 0x2100
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[2:5], s1 offset:240
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[2:5], s1 offset:224
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[2:5], s1 offset:208
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[2:5], s1 offset:192
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[2:5], s1 offset:176
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[2:5], s1 offset:160
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[2:5], s1 offset:144
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[2:5], s1 offset:128
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[2:5], s1 offset:112
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[2:5], s1 offset:96
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[2:5], s1 offset:80
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[2:5], s1 offset:64
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[2:5], s1 offset:48
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[2:5], s1 offset:32
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[2:5], s1 offset:16
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[2:5], s1
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_cbranch_scc1 .LBB1_1
-; FLATSCR-NEXT: ; %bb.2: ; %split
+; FLATSCR-NEXT: ; %bb.2: ; %static-memset-post-expansion
; FLATSCR-NEXT: s_movk_i32 s0, 0x2000
-; FLATSCR-NEXT: s_add_i32 s1, s33, s0
-; FLATSCR-NEXT: s_add_i32 s0, s1, 0x3000
-; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s0 offset:208 glc
+; FLATSCR-NEXT: s_add_i32 s4, s33, s0
+; FLATSCR-NEXT: s_mov_b32 s0, 0
+; FLATSCR-NEXT: s_mov_b32 s3, s0
+; FLATSCR-NEXT: s_mov_b32 s1, s0
+; FLATSCR-NEXT: s_mov_b32 s2, s0
+; FLATSCR-NEXT: v_mov_b32_e32 v5, s3
+; FLATSCR-NEXT: s_addk_i32 s4, 0x3000
+; FLATSCR-NEXT: v_mov_b32_e32 v4, s2
+; FLATSCR-NEXT: v_mov_b32_e32 v3, s1
+; FLATSCR-NEXT: v_mov_b32_e32 v2, s0
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[2:5], s4 offset:256
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[2:5], s4 offset:272
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_add_i32 s0, s33, 0x3000
+; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s4 offset:208 glc
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: scratch_load_dwordx2 v[4:5], off, s0 offset:64 glc
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_mov_b32 s32, s34
-; FLATSCR-NEXT: s_mov_b32 s34, s3
-; FLATSCR-NEXT: s_mov_b32 s33, s2
+; FLATSCR-NEXT: s_mov_b32 s34, s6
+; FLATSCR-NEXT: s_mov_b32 s33, s5
; FLATSCR-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
; FLATSCR-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
; FLATSCR-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
@@ -216,76 +615,222 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(ptr addrspace(1) %out
; MUBUF-NEXT: s_mov_b32 s4, 0
; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: .LBB2_1: ; %loadstoreloop
+; MUBUF-NEXT: .LBB2_1: ; %static-memset-expansion-main-body
; MUBUF-NEXT: ; =>This Inner Loop Header: Depth=1
; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000
; MUBUF-NEXT: v_add_u32_e32 v1, s4, v2
-; MUBUF-NEXT: s_add_i32 s4, s4, 1
-; MUBUF-NEXT: s_cmpk_lt_u32 s4, 0x2120
-; MUBUF-NEXT: buffer_store_byte v0, v1, s[0:3], 0 offen
+; MUBUF-NEXT: s_addk_i32 s4, 0x100
+; MUBUF-NEXT: s_cmpk_lt_u32 s4, 0x2100
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:252
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:248
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:244
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:240
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:236
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:232
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:228
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:224
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:220
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:216
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:212
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:208
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:204
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:200
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:196
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:192
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:188
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:184
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:180
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:176
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:172
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:168
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:164
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:160
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:156
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:152
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:148
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:144
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:140
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:136
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:132
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:128
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:124
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:120
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:116
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:112
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:108
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:104
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:100
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:96
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:92
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:88
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:84
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:80
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:76
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:72
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:68
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:64
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:60
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:56
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:52
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:48
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:44
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:40
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:36
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:32
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:28
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:24
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:20
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:12
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:8
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:4
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_cbranch_scc1 .LBB2_1
-; MUBUF-NEXT: ; %bb.2: ; %split
-; MUBUF-NEXT: s_movk_i32 s5, 0x12d4
-; MUBUF-NEXT: v_mov_b32_e32 v1, 0x4000
-; MUBUF-NEXT: v_or_b32_e32 v0, s5, v1
-; MUBUF-NEXT: s_movk_i32 s5, 0x12d0
+; MUBUF-NEXT: ; %bb.2: ; %static-memset-post-expansion
+; MUBUF-NEXT: v_mov_b32_e32 v0, 0x6100
+; MUBUF-NEXT: v_mov_b32_e32 v6, 0
+; MUBUF-NEXT: s_movk_i32 s4, 0x2110
; MUBUF-NEXT: v_mov_b32_e32 v1, 0x4000
-; MUBUF-NEXT: s_movk_i32 s4, 0x4000
-; MUBUF-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen glc
+; MUBUF-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8
; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: v_or_b32_e32 v0, s5, v1
-; MUBUF-NEXT: s_movk_i32 s5, 0x12c4
+; MUBUF-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:4
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: v_add_u32_e32 v0, s4, v1
+; MUBUF-NEXT: s_movk_i32 s4, 0x12c0
; MUBUF-NEXT: v_mov_b32_e32 v1, 0x4000
-; MUBUF-NEXT: s_or_b32 s4, s4, 0x12c0
-; MUBUF-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen glc
+; MUBUF-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:4
; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: v_or_b32_e32 v0, s5, v1
-; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen glc
+; MUBUF-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: v_or_b32_e32 v0, s4, v1
+; MUBUF-NEXT: s_movk_i32 s4, 0x12d4
+; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000
+; MUBUF-NEXT: v_or_b32_e32 v1, s4, v2
+; MUBUF-NEXT: s_movk_i32 s4, 0x12d0
+; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000
+; MUBUF-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen glc
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: v_or_b32_e32 v1, s4, v2
+; MUBUF-NEXT: s_movk_i32 s4, 0x12c4
+; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000
+; MUBUF-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen glc
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: v_or_b32_e32 v1, s4, v2
+; MUBUF-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen glc
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: buffer_load_dword v8, v0, s[0:3], 0 offen glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: v_mov_b32_e32 v0, s4
; MUBUF-NEXT: s_movk_i32 s4, 0x12cc
-; MUBUF-NEXT: v_mov_b32_e32 v3, 0x4000
-; MUBUF-NEXT: v_or_b32_e32 v2, s4, v3
+; MUBUF-NEXT: v_mov_b32_e32 v1, 0x4000
+; MUBUF-NEXT: v_or_b32_e32 v0, s4, v1
; MUBUF-NEXT: s_movk_i32 s4, 0x12c8
-; MUBUF-NEXT: v_mov_b32_e32 v6, 0x4000
+; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000
+; MUBUF-NEXT: v_or_b32_e32 v1, s4, v2
+; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000
; MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: v_mov_b32_e32 v7, 0x4000
-; MUBUF-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen glc
-; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: v_or_b32_e32 v2, s4, v6
-; MUBUF-NEXT: v_mov_b32_e32 v8, 0x4000
-; MUBUF-NEXT: v_mov_b32_e32 v9, 0x4000
-; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen glc
-; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: v_mov_b32_e32 v10, 0x4000
-; MUBUF-NEXT: buffer_load_dword v6, v7, s[0:3], 0 offen glc
+; MUBUF-NEXT: v_mov_b32_e32 v3, 0x4000
+; MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: v_mov_b32_e32 v11, 0x4000
-; MUBUF-NEXT: buffer_load_dword v7, v8, s[0:3], 0 offen offset:4 glc
+; MUBUF-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen glc
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000
+; MUBUF-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:4 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: v_mov_b32_e32 v12, 0x4000
-; MUBUF-NEXT: buffer_load_dword v8, v9, s[0:3], 0 offen offset:8 glc
+; MUBUF-NEXT: buffer_load_dword v2, v3, s[0:3], 0 offen offset:8 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; MUBUF-NEXT: buffer_load_dword v9, v10, s[0:3], 0 offen offset:12 glc
+; MUBUF-NEXT: v_mov_b32_e32 v13, 0x4000
+; MUBUF-NEXT: buffer_load_dword v3, v11, s[0:3], 0 offen offset:12 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: v_add_co_u32_e32 v2, vcc, v2, v8
-; MUBUF-NEXT: buffer_load_dword v10, v11, s[0:3], 0 offen offset:16 glc
+; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; MUBUF-NEXT: buffer_load_dword v11, v12, s[0:3], 0 offen offset:16 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v9, vcc
-; MUBUF-NEXT: buffer_load_dword v11, v12, s[0:3], 0 offen offset:20 glc
+; MUBUF-NEXT: v_add_co_u32_e32 v2, vcc, v1, v2
+; MUBUF-NEXT: buffer_load_dword v12, v13, s[0:3], 0 offen offset:20 glc
; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: v_add_co_u32_e32 v0, vcc, v0, v6
-; MUBUF-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v7, vcc
-; MUBUF-NEXT: v_mov_b32_e32 v12, 0
-; MUBUF-NEXT: v_add_co_u32_e32 v4, vcc, v4, v10
-; MUBUF-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v11, vcc
+; MUBUF-NEXT: v_addc_co_u32_e32 v3, vcc, v0, v3, vcc
+; MUBUF-NEXT: v_add_co_u32_e32 v0, vcc, v8, v9
+; MUBUF-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v10, vcc
+; MUBUF-NEXT: v_add_co_u32_e32 v4, vcc, v4, v11
+; MUBUF-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v12, vcc
; MUBUF-NEXT: s_waitcnt lgkmcnt(0)
-; MUBUF-NEXT: global_store_dwordx2 v12, v[4:5], s[4:5] offset:16
+; MUBUF-NEXT: global_store_dwordx2 v6, v[4:5], s[4:5] offset:16
; MUBUF-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: global_store_dwordx4 v12, v[0:3], s[4:5]
+; MUBUF-NEXT: global_store_dwordx4 v6, v[0:3], s[4:5]
; MUBUF-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT: s_endpgm
;
@@ -297,16 +842,67 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(ptr addrspace(1) %out
; FLATSCR-NEXT: s_mov_b32 s0, 0
; FLATSCR-NEXT: scratch_store_dword off, v0, s0 offset:1024
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR-NEXT: .LBB2_1: ; %loadstoreloop
+; FLATSCR-NEXT: s_mov_b32 s1, s0
+; FLATSCR-NEXT: s_mov_b32 s2, s0
+; FLATSCR-NEXT: s_mov_b32 s3, s0
+; FLATSCR-NEXT: v_mov_b32_e32 v0, s0
+; FLATSCR-NEXT: v_mov_b32_e32 v1, s1
+; FLATSCR-NEXT: v_mov_b32_e32 v2, s2
+; FLATSCR-NEXT: v_mov_b32_e32 v3, s3
+; FLATSCR-NEXT: .LBB2_1: ; %static-memset-expansion-main-body
; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1
; FLATSCR-NEXT: s_add_i32 s1, s0, 0x2000
-; FLATSCR-NEXT: s_add_i32 s0, s0, 1
-; FLATSCR-NEXT: s_cmpk_lt_u32 s0, 0x2120
-; FLATSCR-NEXT: scratch_store_byte off, v0, s1
+; FLATSCR-NEXT: s_addk_i32 s0, 0x100
+; FLATSCR-NEXT: s_cmpk_lt_u32 s0, 0x2100
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:240
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:224
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:208
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:192
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:176
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:160
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:144
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:128
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:112
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:96
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:80
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:64
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:48
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:32
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1 offset:16
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s1
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_cbranch_scc1 .LBB2_1
-; FLATSCR-NEXT: ; %bb.2: ; %split
+; FLATSCR-NEXT: ; %bb.2: ; %static-memset-post-expansion
+; FLATSCR-NEXT: s_movk_i32 s0, 0x2000
+; FLATSCR-NEXT: s_add_i32 s6, s0, 0x2000
+; FLATSCR-NEXT: s_mov_b32 s0, 0
+; FLATSCR-NEXT: s_mov_b32 s1, s0
+; FLATSCR-NEXT: s_mov_b32 s2, s0
+; FLATSCR-NEXT: s_mov_b32 s3, s0
+; FLATSCR-NEXT: v_mov_b32_e32 v0, s0
+; FLATSCR-NEXT: v_mov_b32_e32 v1, s1
+; FLATSCR-NEXT: v_mov_b32_e32 v2, s2
+; FLATSCR-NEXT: v_mov_b32_e32 v3, s3
; FLATSCR-NEXT: s_movk_i32 s0, 0x1000
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s6 offset:256
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s6 offset:272
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
; FLATSCR-NEXT: s_addk_i32 s0, 0x2000
; FLATSCR-NEXT: scratch_load_dwordx2 v[8:9], off, s0 offset:720 glc
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll
index 83d6f4f5882b4..a57a4a38b1cbd 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll
@@ -1328,15 +1328,45 @@ define void @memset_known(ptr addrspace(7) inreg %ptr) {
; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0
; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1
-; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
-; CHECK: [[LOADSTORELOOP]]:
-; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ]
+; CHECK-NEXT: br label %[[STATIC_MEMSET_LOOP_EXPANSION_MAIN_BODY:.*]]
+; CHECK: [[STATIC_MEMSET_LOOP_EXPANSION_MAIN_BODY]]:
+; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[STATIC_MEMSET_LOOP_EXPANSION_MAIN_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[TMP1]]
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0)
-; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 1
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 0), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 1), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 2), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 3)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_4:%.*]] = add nuw i32 [[TMP2]], 16
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 4), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 5), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 6), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 7)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_4]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_8:%.*]] = add nuw i32 [[TMP2]], 32
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 8), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 9), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 10), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 11)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_8]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_12:%.*]] = add nuw i32 [[TMP2]], 48
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 12), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 13), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 14), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 15)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_12]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_16:%.*]] = add nuw i32 [[TMP2]], 64
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 16), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 17), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 18), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 19)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_16]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_20:%.*]] = add nuw i32 [[TMP2]], 80
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 20), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 21), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 22), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 23)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_20]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_24:%.*]] = add nuw i32 [[TMP2]], 96
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 24), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 25), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 26), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 27)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_24]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_28:%.*]] = add nuw i32 [[TMP2]], 112
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 28), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 29), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 30), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 31)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_28]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_32:%.*]] = add nuw i32 [[TMP2]], 128
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 32), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 33), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 34), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 35)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_32]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_36:%.*]] = add nuw i32 [[TMP2]], 144
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 36), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 37), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 38), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 39)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_36]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_40:%.*]] = add nuw i32 [[TMP2]], 160
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 40), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 41), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 42), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 43)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_40]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_44:%.*]] = add nuw i32 [[TMP2]], 176
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 44), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 45), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 46), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 47)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_44]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_48:%.*]] = add nuw i32 [[TMP2]], 192
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 48), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 49), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 50), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 51)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_48]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_52:%.*]] = add nuw i32 [[TMP2]], 208
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 52), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 53), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 54), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 55)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_52]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_56:%.*]] = add nuw i32 [[TMP2]], 224
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 56), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 57), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 58), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 59)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_60:%.*]] = add nuw i32 [[TMP2]], 240
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 60), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 61), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 62), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 63)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0)
+; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 256
; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 8192
-; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
-; CHECK: [[SPLIT]]:
+; CHECK-NEXT: br i1 [[TMP4]], label %[[STATIC_MEMSET_LOOP_EXPANSION_MAIN_BODY]], label %[[STATIC_MEMSET_POST_LOOP_EXPANSION:.*]]
+; CHECK: [[STATIC_MEMSET_POST_LOOP_EXPANSION]]:
; CHECK-NEXT: ret void
;
call void @llvm.memset.p7.i32(ptr addrspace(7) noundef nonnull align 16 %ptr, i8 1, i32 8192, i1 false)
@@ -1348,15 +1378,9 @@ define void @memset_known_small(ptr addrspace(7) inreg %ptr) {
; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0
; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1
-; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
-; CHECK: [[LOADSTORELOOP]]:
-; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[TMP1]]
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0)
-; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 1
-; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 32
-; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
-; CHECK: [[SPLIT]]:
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<16 x i8> splat (i8 1) to <4 x i32>), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[PTR_OFF]], i32 0, i32 0)
+; CHECK-NEXT: [[TMP1:%.*]] = add nuw i32 [[PTR_OFF]], 16
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<16 x i8> splat (i8 1) to <4 x i32>), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP1]], i32 0, i32 0)
; CHECK-NEXT: ret void
;
call void @llvm.memset.p7.i32(ptr addrspace(7) %ptr, i8 1, i32 32, i1 false)
@@ -1368,15 +1392,7 @@ define void @memset_known_byte(ptr addrspace(7) inreg %ptr) {
; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0
; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1
-; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
-; CHECK: [[LOADSTORELOOP]]:
-; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[TMP1]]
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0)
-; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 1
-; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 1
-; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
-; CHECK: [[SPLIT]]:
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[PTR_OFF]], i32 0, i32 0)
; CHECK-NEXT: ret void
;
call void @llvm.memset.p7.i32(ptr addrspace(7) %ptr, i8 1, i32 1, i1 false)
@@ -1388,15 +1404,13 @@ define void @memset_known_tail(ptr addrspace(7) inreg %ptr) {
; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0
; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1
-; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
-; CHECK: [[LOADSTORELOOP]]:
-; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[TMP1]]
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i64(i64 bitcast (<8 x i8> splat (i8 1) to i64), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[PTR_OFF]], i32 0, i32 0)
+; CHECK-NEXT: [[TMP1:%.*]] = add nuw i32 [[PTR_OFF]], 8
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 bitcast (<4 x i8> splat (i8 1) to i32), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP1]], i32 0, i32 0)
+; CHECK-NEXT: [[TMP3:%.*]] = add nuw i32 [[PTR_OFF]], 12
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 bitcast (<2 x i8> splat (i8 1) to i16), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP3]], i32 0, i32 0)
+; CHECK-NEXT: [[TMP2:%.*]] = add nuw i32 [[PTR_OFF]], 14
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0)
-; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 1
-; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 15
-; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
-; CHECK: [[SPLIT]]:
; CHECK-NEXT: ret void
;
call void @llvm.memset.p7.i32(ptr addrspace(7) %ptr, i8 1, i32 15, i1 false)
@@ -1408,16 +1422,46 @@ define void @memset_known_i64(ptr addrspace(7) inreg %ptr) {
; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0
; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1
-; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
-; CHECK: [[LOADSTORELOOP]]:
-; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ]
+; CHECK-NEXT: br label %[[STATIC_MEMSET_LOOP_EXPANSION_MAIN_BODY:.*]]
+; CHECK: [[STATIC_MEMSET_LOOP_EXPANSION_MAIN_BODY]]:
+; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[STATIC_MEMSET_LOOP_EXPANSION_MAIN_BODY]] ]
; CHECK-NEXT: [[DOTC:%.*]] = trunc i64 [[TMP1]] to i32
; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[DOTC]]
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0)
-; CHECK-NEXT: [[TMP3]] = add i64 [[TMP1]], 1
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 0), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 1), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 2), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 3)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_4:%.*]] = add nuw i32 [[TMP2]], 16
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 4), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 5), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 6), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 7)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_4]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_8:%.*]] = add nuw i32 [[TMP2]], 32
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 8), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 9), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 10), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 11)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_8]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_12:%.*]] = add nuw i32 [[TMP2]], 48
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 12), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 13), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 14), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 15)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_12]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_16:%.*]] = add nuw i32 [[TMP2]], 64
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 16), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 17), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 18), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 19)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_16]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_20:%.*]] = add nuw i32 [[TMP2]], 80
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 20), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 21), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 22), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 23)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_20]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_24:%.*]] = add nuw i32 [[TMP2]], 96
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 24), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 25), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 26), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 27)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_24]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_28:%.*]] = add nuw i32 [[TMP2]], 112
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 28), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 29), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 30), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 31)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_28]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_32:%.*]] = add nuw i32 [[TMP2]], 128
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 32), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 33), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 34), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 35)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_32]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_36:%.*]] = add nuw i32 [[TMP2]], 144
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 36), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 37), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 38), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 39)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_36]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_40:%.*]] = add nuw i32 [[TMP2]], 160
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 40), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 41), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 42), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 43)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_40]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_44:%.*]] = add nuw i32 [[TMP2]], 176
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 44), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 45), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 46), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 47)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_44]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_48:%.*]] = add nuw i32 [[TMP2]], 192
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 48), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 49), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 50), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 51)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_48]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_52:%.*]] = add nuw i32 [[TMP2]], 208
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 52), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 53), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 54), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 55)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_52]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_56:%.*]] = add nuw i32 [[TMP2]], 224
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 56), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 57), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 58), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 59)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_60:%.*]] = add nuw i32 [[TMP2]], 240
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 60), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 61), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 62), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 63)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0)
+; CHECK-NEXT: [[TMP3]] = add i64 [[TMP1]], 256
; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 8192
-; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
-; CHECK: [[SPLIT]]:
+; CHECK-NEXT: br i1 [[TMP4]], label %[[STATIC_MEMSET_LOOP_EXPANSION_MAIN_BODY]], label %[[STATIC_MEMSET_POST_LOOP_EXPANSION:.*]]
+; CHECK: [[STATIC_MEMSET_POST_LOOP_EXPANSION]]:
; CHECK-NEXT: ret void
;
call void @llvm.memset.p7.i64(ptr addrspace(7) %ptr, i8 1, i64 8192, i1 false)
@@ -1429,15 +1473,9 @@ define void @memset_known_i32_volatile(ptr addrspace(7) inreg %ptr) {
; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0
; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1
-; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
-; CHECK: [[LOADSTORELOOP]]:
-; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[TMP1]]
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 -2147483648)
-; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 1
-; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 32
-; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
-; CHECK: [[SPLIT]]:
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<16 x i8> splat (i8 1) to <4 x i32>), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[PTR_OFF]], i32 0, i32 -2147483648)
+; CHECK-NEXT: [[TMP1:%.*]] = add nuw i32 [[PTR_OFF]], 16
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<16 x i8> splat (i8 1) to <4 x i32>), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP1]], i32 0, i32 -2147483648)
; CHECK-NEXT: ret void
;
call void @llvm.memset.p7.i32(ptr addrspace(7) %ptr, i8 1, i32 32, i1 true)
@@ -1449,16 +1487,29 @@ define void @memset_unknown(ptr addrspace(7) inreg %ptr, i32 inreg %length) {
; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]], i32 inreg [[LENGTH:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0
; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1
-; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 0, [[LENGTH]]
-; CHECK-NEXT: br i1 [[TMP1]], label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
-; CHECK: [[LOADSTORELOOP]]:
-; CHECK-NEXT: [[TMP2:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOADSTORELOOP]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[LENGTH]], 15
+; CHECK-NEXT: [[TMP4:%.*]] = sub i32 [[LENGTH]], [[TMP1]]
+; CHECK-NEXT: [[TMP12:%.*]] = icmp ne i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP12]], label %[[DYNAMIC_MEMSET_LOOP_EXPANSION_MAIN_BODY:.*]], label %[[DYNAMIC_MEMSET_LOOP_EXPANSION_RESIDUAL_COND:.*]]
+; CHECK: [[DYNAMIC_MEMSET_LOOP_EXPANSION_MAIN_BODY]]:
+; CHECK-NEXT: [[TMP2:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[DYNAMIC_MEMSET_LOOP_EXPANSION_MAIN_BODY]] ]
; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[PTR_OFF]], [[TMP2]]
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP3]], i32 0, i32 0)
-; CHECK-NEXT: [[TMP4]] = add i32 [[TMP2]], 1
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], [[LENGTH]]
-; CHECK-NEXT: br i1 [[TMP5]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
-; CHECK: [[SPLIT]]:
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<16 x i8> splat (i8 1) to <4 x i32>), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP3]], i32 0, i32 0)
+; CHECK-NEXT: [[TMP5]] = add i32 [[TMP2]], 16
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], [[TMP4]]
+; CHECK-NEXT: br i1 [[TMP6]], label %[[DYNAMIC_MEMSET_LOOP_EXPANSION_MAIN_BODY]], label %[[DYNAMIC_MEMSET_LOOP_EXPANSION_RESIDUAL_COND]]
+; CHECK: [[DYNAMIC_MEMSET_LOOP_EXPANSION_RESIDUAL_COND]]:
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[TMP7]], label %[[DYNAMIC_MEMSET_LOOP_EXPANSION_RESIDUAL_BODY:.*]], label %[[DYNAMIC_MEMSET_POST_LOOP_EXPANSION:.*]]
+; CHECK: [[DYNAMIC_MEMSET_LOOP_EXPANSION_RESIDUAL_BODY]]:
+; CHECK-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, %[[DYNAMIC_MEMSET_LOOP_EXPANSION_RESIDUAL_COND]] ], [ [[TMP10:%.*]], %[[DYNAMIC_MEMSET_LOOP_EXPANSION_RESIDUAL_BODY]] ]
+; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP4]], [[RESIDUAL_LOOP_INDEX]]
+; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[PTR_OFF]], [[TMP8]]
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP9]], i32 0, i32 0)
+; CHECK-NEXT: [[TMP10]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
+; CHECK-NEXT: [[TMP11:%.*]] = icmp ult i32 [[TMP10]], [[TMP1]]
+; CHECK-NEXT: br i1 [[TMP11]], label %[[DYNAMIC_MEMSET_LOOP_EXPANSION_RESIDUAL_BODY]], label %[[DYNAMIC_MEMSET_POST_LOOP_EXPANSION]]
+; CHECK: [[DYNAMIC_MEMSET_POST_LOOP_EXPANSION]]:
; CHECK-NEXT: ret void
;
call void @llvm.memset.p7.i32(ptr addrspace(7) %ptr, i8 1, i32 %length, i1 false)
@@ -1475,15 +1526,45 @@ define void @memset.inline_known(ptr addrspace(7) inreg %ptr) {
; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0
; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1
-; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
-; CHECK: [[LOADSTORELOOP]]:
-; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ]
+; CHECK-NEXT: br label %[[STATIC_MEMSET_LOOP_EXPANSION_MAIN_BODY:.*]]
+; CHECK: [[STATIC_MEMSET_LOOP_EXPANSION_MAIN_BODY]]:
+; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[STATIC_MEMSET_LOOP_EXPANSION_MAIN_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[TMP1]]
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0)
-; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 1
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 0), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 1), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 2), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 3)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_4:%.*]] = add nuw i32 [[TMP2]], 16
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 4), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 5), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 6), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 7)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_4]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_8:%.*]] = add nuw i32 [[TMP2]], 32
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 8), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 9), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 10), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 11)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_8]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_12:%.*]] = add nuw i32 [[TMP2]], 48
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 12), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 13), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 14), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 15)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_12]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_16:%.*]] = add nuw i32 [[TMP2]], 64
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 16), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 17), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 18), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 19)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_16]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_20:%.*]] = add nuw i32 [[TMP2]], 80
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 20), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 21), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 22), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 23)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_20]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_24:%.*]] = add nuw i32 [[TMP2]], 96
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 24), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 25), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 26), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 27)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_24]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_28:%.*]] = add nuw i32 [[TMP2]], 112
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 28), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 29), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 30), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 31)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_28]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_32:%.*]] = add nuw i32 [[TMP2]], 128
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 32), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 33), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 34), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 35)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_32]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_36:%.*]] = add nuw i32 [[TMP2]], 144
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 36), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 37), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 38), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 39)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_36]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_40:%.*]] = add nuw i32 [[TMP2]], 160
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 40), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 41), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 42), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 43)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_40]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_44:%.*]] = add nuw i32 [[TMP2]], 176
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 44), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 45), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 46), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 47)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_44]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_48:%.*]] = add nuw i32 [[TMP2]], 192
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 48), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 49), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 50), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 51)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_48]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_52:%.*]] = add nuw i32 [[TMP2]], 208
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 52), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 53), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 54), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 55)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_52]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_56:%.*]] = add nuw i32 [[TMP2]], 224
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 56), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 57), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 58), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 59)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_60:%.*]] = add nuw i32 [[TMP2]], 240
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 60), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 61), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 62), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 63)>, ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0)
+; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 256
; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 8192
-; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
-; CHECK: [[SPLIT]]:
+; CHECK-NEXT: br i1 [[TMP4]], label %[[STATIC_MEMSET_LOOP_EXPANSION_MAIN_BODY]], label %[[STATIC_MEMSET_POST_LOOP_EXPANSION:.*]]
+; CHECK: [[STATIC_MEMSET_POST_LOOP_EXPANSION]]:
; CHECK-NEXT: ret void
;
call void @llvm.memset.inline.p7.i32(ptr addrspace(7) noundef nonnull align 16 %ptr, i8 1, i32 8192, i1 false)
@@ -1495,15 +1576,9 @@ define void @memset.inline_known_small(ptr addrspace(7) inreg %ptr) {
; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0
; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1
-; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
-; CHECK: [[LOADSTORELOOP]]:
-; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[TMP1]]
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0)
-; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 1
-; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 32
-; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
-; CHECK: [[SPLIT]]:
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<16 x i8> splat (i8 1) to <4 x i32>), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[PTR_OFF]], i32 0, i32 0)
+; CHECK-NEXT: [[TMP1:%.*]] = add nuw i32 [[PTR_OFF]], 16
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<16 x i8> splat (i8 1) to <4 x i32>), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP1]], i32 0, i32 0)
; CHECK-NEXT: ret void
;
call void @llvm.memset.inline.p7.i32(ptr addrspace(7) %ptr, i8 1, i32 32, i1 false)
@@ -1515,15 +1590,7 @@ define void @memset.inline_known_byte(ptr addrspace(7) inreg %ptr) {
; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0
; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1
-; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
-; CHECK: [[LOADSTORELOOP]]:
-; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[TMP1]]
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0)
-; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 1
-; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 1
-; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
-; CHECK: [[SPLIT]]:
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[PTR_OFF]], i32 0, i32 0)
; CHECK-NEXT: ret void
;
call void @llvm.memset.inline.p7.i32(ptr addrspace(7) %ptr, i8 1, i32 1, i1 false)
@@ -1535,15 +1602,13 @@ define void @memset.inline_known_tail(ptr addrspace(7) inreg %ptr) {
; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0
; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1
-; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
-; CHECK: [[LOADSTORELOOP]]:
-; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[TMP1]]
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i64(i64 bitcast (<8 x i8> splat (i8 1) to i64), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[PTR_OFF]], i32 0, i32 0)
+; CHECK-NEXT: [[TMP1:%.*]] = add nuw i32 [[PTR_OFF]], 8
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 bitcast (<4 x i8> splat (i8 1) to i32), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP1]], i32 0, i32 0)
+; CHECK-NEXT: [[TMP3:%.*]] = add nuw i32 [[PTR_OFF]], 12
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 bitcast (<2 x i8> splat (i8 1) to i16), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP3]], i32 0, i32 0)
+; CHECK-NEXT: [[TMP2:%.*]] = add nuw i32 [[PTR_OFF]], 14
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0)
-; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 1
-; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 15
-; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
-; CHECK: [[SPLIT]]:
; CHECK-NEXT: ret void
;
call void @llvm.memset.inline.p7.i32(ptr addrspace(7) %ptr, i8 1, i32 15, i1 false)
@@ -1555,16 +1620,46 @@ define void @memset.inline_known_i64(ptr addrspace(7) inreg %ptr) {
; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0
; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1
-; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
-; CHECK: [[LOADSTORELOOP]]:
-; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ]
+; CHECK-NEXT: br label %[[STATIC_MEMSET_LOOP_EXPANSION_MAIN_BODY:.*]]
+; CHECK: [[STATIC_MEMSET_LOOP_EXPANSION_MAIN_BODY]]:
+; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[STATIC_MEMSET_LOOP_EXPANSION_MAIN_BODY]] ]
; CHECK-NEXT: [[DOTC:%.*]] = trunc i64 [[TMP1]] to i32
; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[DOTC]]
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0)
-; CHECK-NEXT: [[TMP3]] = add i64 [[TMP1]], 1
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 0), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 1), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 2), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 3)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_4:%.*]] = add nuw i32 [[TMP2]], 16
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 4), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 5), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 6), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 7)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_4]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_8:%.*]] = add nuw i32 [[TMP2]], 32
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 8), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 9), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 10), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 11)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_8]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_12:%.*]] = add nuw i32 [[TMP2]], 48
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 12), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 13), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 14), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 15)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_12]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_16:%.*]] = add nuw i32 [[TMP2]], 64
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 16), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 17), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 18), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 19)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_16]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_20:%.*]] = add nuw i32 [[TMP2]], 80
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 20), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 21), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 22), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 23)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_20]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_24:%.*]] = add nuw i32 [[TMP2]], 96
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 24), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 25), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 26), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 27)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_24]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_28:%.*]] = add nuw i32 [[TMP2]], 112
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 28), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 29), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 30), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 31)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_28]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_32:%.*]] = add nuw i32 [[TMP2]], 128
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 32), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 33), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 34), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 35)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_32]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_36:%.*]] = add nuw i32 [[TMP2]], 144
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 36), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 37), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 38), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 39)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_36]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_40:%.*]] = add nuw i32 [[TMP2]], 160
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 40), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 41), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 42), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 43)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_40]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_44:%.*]] = add nuw i32 [[TMP2]], 176
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 44), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 45), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 46), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 47)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_44]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_48:%.*]] = add nuw i32 [[TMP2]], 192
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 48), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 49), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 50), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 51)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_48]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_52:%.*]] = add nuw i32 [[TMP2]], 208
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 52), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 53), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 54), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 55)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_52]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_56:%.*]] = add nuw i32 [[TMP2]], 224
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 56), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 57), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 58), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 59)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0)
+; CHECK-NEXT: [[DOTPART_60:%.*]] = add nuw i32 [[TMP2]], 240
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> <i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 60), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 61), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 62), i32 extractelement (<64 x i32> bitcast (<256 x i8> splat (i8 1) to <64 x i32>), i32 63)>, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0)
+; CHECK-NEXT: [[TMP3]] = add i64 [[TMP1]], 256
; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 8192
-; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
-; CHECK: [[SPLIT]]:
+; CHECK-NEXT: br i1 [[TMP4]], label %[[STATIC_MEMSET_LOOP_EXPANSION_MAIN_BODY]], label %[[STATIC_MEMSET_POST_LOOP_EXPANSION:.*]]
+; CHECK: [[STATIC_MEMSET_POST_LOOP_EXPANSION]]:
; CHECK-NEXT: ret void
;
call void @llvm.memset.inline.p7.i64(ptr addrspace(7) %ptr, i8 1, i64 8192, i1 false)
@@ -1576,15 +1671,9 @@ define void @memset.inline_known_i32_volatile(ptr addrspace(7) inreg %ptr) {
; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0
; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1
-; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
-; CHECK: [[LOADSTORELOOP]]:
-; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[TMP1]]
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 -2147483648)
-; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 1
-; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 32
-; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
-; CHECK: [[SPLIT]]:
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<16 x i8> splat (i8 1) to <4 x i32>), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[PTR_OFF]], i32 0, i32 -2147483648)
+; CHECK-NEXT: [[TMP1:%.*]] = add nuw i32 [[PTR_OFF]], 16
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<16 x i8> splat (i8 1) to <4 x i32>), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP1]], i32 0, i32 -2147483648)
; CHECK-NEXT: ret void
;
call void @llvm.memset.inline.p7.i32(ptr addrspace(7) %ptr, i8 1, i32 32, i1 true)
@@ -1596,16 +1685,29 @@ define void @memset.inline_unknown(ptr addrspace(7) inreg %ptr, i32 inreg %lengt
; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]], i32 inreg [[LENGTH:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0
; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1
-; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 0, [[LENGTH]]
-; CHECK-NEXT: br i1 [[TMP1]], label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
-; CHECK: [[LOADSTORELOOP]]:
-; CHECK-NEXT: [[TMP2:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOADSTORELOOP]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[LENGTH]], 15
+; CHECK-NEXT: [[TMP4:%.*]] = sub i32 [[LENGTH]], [[TMP1]]
+; CHECK-NEXT: [[TMP12:%.*]] = icmp ne i32 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[TMP12]], label %[[DYNAMIC_MEMSET_LOOP_EXPANSION_MAIN_BODY:.*]], label %[[DYNAMIC_MEMSET_LOOP_EXPANSION_RESIDUAL_COND:.*]]
+; CHECK: [[DYNAMIC_MEMSET_LOOP_EXPANSION_MAIN_BODY]]:
+; CHECK-NEXT: [[TMP2:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[DYNAMIC_MEMSET_LOOP_EXPANSION_MAIN_BODY]] ]
; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[PTR_OFF]], [[TMP2]]
-; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP3]], i32 0, i32 0)
-; CHECK-NEXT: [[TMP4]] = add i32 [[TMP2]], 1
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], [[LENGTH]]
-; CHECK-NEXT: br i1 [[TMP5]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
-; CHECK: [[SPLIT]]:
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> bitcast (<16 x i8> splat (i8 1) to <4 x i32>), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP3]], i32 0, i32 0)
+; CHECK-NEXT: [[TMP5]] = add i32 [[TMP2]], 16
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], [[TMP4]]
+; CHECK-NEXT: br i1 [[TMP6]], label %[[DYNAMIC_MEMSET_LOOP_EXPANSION_MAIN_BODY]], label %[[DYNAMIC_MEMSET_LOOP_EXPANSION_RESIDUAL_COND]]
+; CHECK: [[DYNAMIC_MEMSET_LOOP_EXPANSION_RESIDUAL_COND]]:
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[TMP7]], label %[[DYNAMIC_MEMSET_LOOP_EXPANSION_RESIDUAL_BODY:.*]], label %[[DYNAMIC_MEMSET_POST_LOOP_EXPANSION:.*]]
+; CHECK: [[DYNAMIC_MEMSET_LOOP_EXPANSION_RESIDUAL_BODY]]:
+; CHECK-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, %[[DYNAMIC_MEMSET_LOOP_EXPANSION_RESIDUAL_COND]] ], [ [[TMP10:%.*]], %[[DYNAMIC_MEMSET_LOOP_EXPANSION_RESIDUAL_BODY]] ]
+; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP4]], [[RESIDUAL_LOOP_INDEX]]
+; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[PTR_OFF]], [[TMP8]]
+; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP9]], i32 0, i32 0)
+; CHECK-NEXT: [[TMP10]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
+; CHECK-NEXT: [[TMP11:%.*]] = icmp ult i32 [[TMP10]], [[TMP1]]
+; CHECK-NEXT: br i1 [[TMP11]], label %[[DYNAMIC_MEMSET_LOOP_EXPANSION_RESIDUAL_BODY]], label %[[DYNAMIC_MEMSET_POST_LOOP_EXPANSION]]
+; CHECK: [[DYNAMIC_MEMSET_POST_LOOP_EXPANSION]]:
; CHECK-NEXT: ret void
;
call void @llvm.memset.inline.p7.i32(ptr addrspace(7) %ptr, i8 1, i32 %length, i1 false)
diff --git a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics-threshold.ll b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics-threshold.ll
index cf3443ff33b72..3d73b55831cd0 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics-threshold.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics-threshold.ll
@@ -21,15 +21,6 @@ define amdgpu_kernel void @memset_size_0(ptr addrspace(1) %dst, i8 %val) {
; OPT4-NEXT: ret void
;
; OPT0-LABEL: @memset_size_0(
-; OPT0-NEXT: br i1 true, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]]
-; OPT0: loadstoreloop:
-; OPT0-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ]
-; OPT0-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[TMP1]]
-; OPT0-NEXT: store i8 [[VAL:%.*]], ptr addrspace(1) [[TMP2]], align 1
-; OPT0-NEXT: [[TMP3]] = add i64 [[TMP1]], 1
-; OPT0-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 0
-; OPT0-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]]
-; OPT0: split:
; OPT0-NEXT: ret void
;
; OPT_NEG-LABEL: @memset_size_0(
@@ -50,15 +41,11 @@ define amdgpu_kernel void @memset_size_4(ptr addrspace(1) %dst, i8 %val) {
; OPT4-NEXT: ret void
;
; OPT0-LABEL: @memset_size_4(
-; OPT0-NEXT: br i1 false, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]]
-; OPT0: loadstoreloop:
-; OPT0-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ]
-; OPT0-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[TMP1]]
-; OPT0-NEXT: store i8 [[VAL:%.*]], ptr addrspace(1) [[TMP2]], align 1
-; OPT0-NEXT: [[TMP3]] = add i64 [[TMP1]], 1
-; OPT0-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 4
-; OPT0-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]]
-; OPT0: split:
+; OPT0-NEXT: [[SETVALUE_SPLAT_SPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[VAL:%.*]], i64 0
+; OPT0-NEXT: [[SETVALUE_SPLAT_SPLAT:%.*]] = shufflevector <4 x i8> [[SETVALUE_SPLAT_SPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer
+; OPT0-NEXT: [[SETVALUE_SPLAT_CAST:%.*]] = bitcast <4 x i8> [[SETVALUE_SPLAT_SPLAT]] to i32
+; OPT0-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 0
+; OPT0-NEXT: store i32 [[SETVALUE_SPLAT_CAST]], ptr addrspace(1) [[TMP1]], align 1
; OPT0-NEXT: ret void
;
; OPT_NEG-LABEL: @memset_size_4(
@@ -75,27 +62,19 @@ define amdgpu_kernel void @memset_size_8(ptr addrspace(1) %dst, i8 %val) {
; OPT8-NEXT: ret void
;
; OPT4-LABEL: @memset_size_8(
-; OPT4-NEXT: br i1 false, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]]
-; OPT4: loadstoreloop:
-; OPT4-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ]
-; OPT4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[TMP1]]
-; OPT4-NEXT: store i8 [[VAL:%.*]], ptr addrspace(1) [[TMP2]], align 1
-; OPT4-NEXT: [[TMP3]] = add i64 [[TMP1]], 1
-; OPT4-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 8
-; OPT4-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]]
-; OPT4: split:
+; OPT4-NEXT: [[SETVALUE_SPLAT_SPLATINSERT:%.*]] = insertelement <8 x i8> poison, i8 [[VAL:%.*]], i64 0
+; OPT4-NEXT: [[SETVALUE_SPLAT_SPLAT:%.*]] = shufflevector <8 x i8> [[SETVALUE_SPLAT_SPLATINSERT]], <8 x i8> poison, <8 x i32> zeroinitializer
+; OPT4-NEXT: [[SETVALUE_SPLAT_CAST:%.*]] = bitcast <8 x i8> [[SETVALUE_SPLAT_SPLAT]] to i64
+; OPT4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 0
+; OPT4-NEXT: store i64 [[SETVALUE_SPLAT_CAST]], ptr addrspace(1) [[TMP1]], align 1
; OPT4-NEXT: ret void
;
; OPT0-LABEL: @memset_size_8(
-; OPT0-NEXT: br i1 false, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]]
-; OPT0: loadstoreloop:
-; OPT0-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ]
-; OPT0-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[TMP1]]
-; OPT0-NEXT: store i8 [[VAL:%.*]], ptr addrspace(1) [[TMP2]], align 1
-; OPT0-NEXT: [[TMP3]] = add i64 [[TMP1]], 1
-; OPT0-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 8
-; OPT0-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]]
-; OPT0: split:
+; OPT0-NEXT: [[SETVALUE_SPLAT_SPLATINSERT:%.*]] = insertelement <8 x i8> poison, i8 [[VAL:%.*]], i64 0
+; OPT0-NEXT: [[SETVALUE_SPLAT_SPLAT:%.*]] = shufflevector <8 x i8> [[SETVALUE_SPLAT_SPLATINSERT]], <8 x i8> poison, <8 x i32> zeroinitializer
+; OPT0-NEXT: [[SETVALUE_SPLAT_CAST:%.*]] = bitcast <8 x i8> [[SETVALUE_SPLAT_SPLAT]] to i64
+; OPT0-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 0
+; OPT0-NEXT: store i64 [[SETVALUE_SPLAT_CAST]], ptr addrspace(1) [[TMP1]], align 1
; OPT0-NEXT: ret void
;
; OPT_NEG-LABEL: @memset_size_8(
diff --git a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
index 20a34dc997bbc..929ef2fc3c06c 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
@@ -163,15 +163,18 @@ define amdgpu_kernel void @max_size_small_static_memset_caller0(ptr addrspace(1)
; MAX1024-NEXT: ret void
;
; ALL-LABEL: @max_size_small_static_memset_caller0(
-; ALL-NEXT: br i1 false, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]]
-; ALL: loadstoreloop:
+; ALL-NEXT: [[SETVALUE_SPLAT_SPLATINSERT:%.*]] = insertelement <256 x i8> poison, i8 [[VAL:%.*]], i64 0
+; ALL-NEXT: [[SETVALUE_SPLAT_SPLAT:%.*]] = shufflevector <256 x i8> [[SETVALUE_SPLAT_SPLATINSERT]], <256 x i8> poison, <256 x i32> zeroinitializer
+; ALL-NEXT: [[SETVALUE_SPLAT_CAST:%.*]] = bitcast <256 x i8> [[SETVALUE_SPLAT_SPLAT]] to <64 x i32>
+; ALL-NEXT: br label [[LOADSTORELOOP:%.*]]
+; ALL: static-memset-expansion-main-body:
; ALL-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ]
; ALL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[TMP1]]
-; ALL-NEXT: store i8 [[VAL:%.*]], ptr addrspace(1) [[TMP2]], align 1
-; ALL-NEXT: [[TMP3]] = add i64 [[TMP1]], 1
+; ALL-NEXT: store <64 x i32> [[SETVALUE_SPLAT_CAST]], ptr addrspace(1) [[TMP2]], align 1
+; ALL-NEXT: [[TMP3]] = add i64 [[TMP1]], 256
; ALL-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 1024
-; ALL-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]]
-; ALL: split:
+; ALL-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT:%.*]]
+; ALL: static-memset-post-expansion:
; ALL-NEXT: ret void
;
call void @llvm.memset.p1.i64(ptr addrspace(1) %dst, i8 %val, i64 1024, i1 false)
@@ -180,21 +183,60 @@ define amdgpu_kernel void @max_size_small_static_memset_caller0(ptr addrspace(1)
define amdgpu_kernel void @min_size_large_static_memset_caller0(ptr addrspace(1) %dst, i8 %val) #0 {
; OPT-LABEL: @min_size_large_static_memset_caller0(
-; OPT-NEXT: br i1 false, label [[SPLIT:%.*]], label [[LOADSTORELOOP:%.*]]
-; OPT: loadstoreloop:
+; OPT-NEXT: [[SETVALUE_SPLAT_SPLATINSERT:%.*]] = insertelement <256 x i8> poison, i8 [[VAL:%.*]], i64 0
+; OPT-NEXT: [[SETVALUE_SPLAT_SPLAT:%.*]] = shufflevector <256 x i8> [[SETVALUE_SPLAT_SPLATINSERT]], <256 x i8> poison, <256 x i32> zeroinitializer
+; OPT-NEXT: [[SETVALUE_SPLAT_CAST:%.*]] = bitcast <256 x i8> [[SETVALUE_SPLAT_SPLAT]] to <64 x i32>
+; OPT-NEXT: br label [[LOADSTORELOOP:%.*]]
+; OPT: static-memset-expansion-main-body:
; OPT-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ]
; OPT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[TMP1]]
-; OPT-NEXT: store i8 [[VAL:%.*]], ptr addrspace(1) [[TMP2]], align 1
-; OPT-NEXT: [[TMP3]] = add i64 [[TMP1]], 1
-; OPT-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 1025
-; OPT-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]]
-; OPT: split:
+; OPT-NEXT: store <64 x i32> [[SETVALUE_SPLAT_CAST]], ptr addrspace(1) [[TMP2]], align 1
+; OPT-NEXT: [[TMP3]] = add i64 [[TMP1]], 256
+; OPT-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 1024
+; OPT-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT:%.*]]
+; OPT: static-memset-post-expansion:
+; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 1024
+; OPT-NEXT: store i8 [[VAL]], ptr addrspace(1) [[TMP5]], align 1
; OPT-NEXT: ret void
;
call void @llvm.memset.p1.i64(ptr addrspace(1) %dst, i8 %val, i64 1025, i1 false)
ret void
}
+define amdgpu_kernel void @variable_size_memset_caller0(ptr addrspace(1) %dst, i8 %val, i64 %n) #0 {
+; OPT-LABEL: @variable_size_memset_caller0(
+; OPT-NEXT: [[SETVALUE_SPLAT_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[VAL:%.*]], i64 0
+; OPT-NEXT: [[SETVALUE_SPLAT_SPLAT:%.*]] = shufflevector <16 x i8> [[SETVALUE_SPLAT_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
+; OPT-NEXT: [[SETVALUE_SPLAT_CAST:%.*]] = bitcast <16 x i8> [[SETVALUE_SPLAT_SPLAT]] to <4 x i32>
+; OPT-NEXT: [[TMP1:%.*]] = and i64 [[N:%.*]], 15
+; OPT-NEXT: [[TMP2:%.*]] = sub i64 [[N]], [[TMP1]]
+; OPT-NEXT: [[TMP3:%.*]] = icmp ne i64 [[TMP2]], 0
+; OPT-NEXT: br i1 [[TMP3]], label [[DYNAMIC_MEMSET_EXPANSION_MAIN_BODY:%.*]], label [[DYNAMIC_MEMSET_EXPANSION_RESIDUAL_COND:%.*]]
+; OPT: dynamic-memset-expansion-main-body:
+; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[DYNAMIC_MEMSET_EXPANSION_MAIN_BODY]] ]
+; OPT-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 [[LOOP_INDEX]]
+; OPT-NEXT: store <4 x i32> [[SETVALUE_SPLAT_CAST]], ptr addrspace(1) [[TMP4]], align 1
+; OPT-NEXT: [[TMP5]] = add i64 [[LOOP_INDEX]], 16
+; OPT-NEXT: [[TMP6:%.*]] = icmp ult i64 [[TMP5]], [[TMP2]]
+; OPT-NEXT: br i1 [[TMP6]], label [[DYNAMIC_MEMSET_EXPANSION_MAIN_BODY]], label [[DYNAMIC_MEMSET_EXPANSION_RESIDUAL_COND]]
+; OPT: dynamic-memset-expansion-residual-cond:
+; OPT-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP1]], 0
+; OPT-NEXT: br i1 [[TMP7]], label [[DYNAMIC_MEMSET_EXPANSION_RESIDUAL_BODY:%.*]], label [[DYNAMIC_MEMSET_POST_EXPANSION:%.*]]
+; OPT: dynamic-memset-expansion-residual-body:
+; OPT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[DYNAMIC_MEMSET_EXPANSION_RESIDUAL_COND]] ], [ [[TMP10:%.*]], [[DYNAMIC_MEMSET_EXPANSION_RESIDUAL_BODY]] ]
+; OPT-NEXT: [[TMP8:%.*]] = add i64 [[TMP2]], [[RESIDUAL_LOOP_INDEX]]
+; OPT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[TMP8]]
+; OPT-NEXT: store i8 [[VAL]], ptr addrspace(1) [[TMP9]], align 1
+; OPT-NEXT: [[TMP10]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
+; OPT-NEXT: [[TMP11:%.*]] = icmp ult i64 [[TMP10]], [[TMP1]]
+; OPT-NEXT: br i1 [[TMP11]], label [[DYNAMIC_MEMSET_EXPANSION_RESIDUAL_BODY]], label [[DYNAMIC_MEMSET_POST_EXPANSION]]
+; OPT: dynamic-memset-post-expansion:
+; OPT-NEXT: ret void
+;
+ call void @llvm.memset.p1.i64(ptr addrspace(1) %dst, i8 %val, i64 %n, i1 false)
+ ret void
+}
+
define amdgpu_kernel void @variable_memcpy_caller0(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 %n) #0 {
; OPT-LABEL: @variable_memcpy_caller0(
; OPT-NEXT: [[TMP2:%.*]] = and i64 [[N:%.*]], 15
diff --git a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll
index 4f2816538b1ff..c60642e2cc4d8 100644
--- a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll
+++ b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll
@@ -15974,6 +15974,1616 @@ entry:
ret void
}
+define void @memset_p0_sz2048(ptr addrspace(0) %dst) {
+; CHECK-LABEL: memset_p0_sz2048:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_mov_b32 s4, 0x41414141
+; CHECK-NEXT: s_mov_b32 s5, s4
+; CHECK-NEXT: s_mov_b32 s6, s4
+; CHECK-NEXT: s_mov_b32 s7, s4
+; CHECK-NEXT: v_mov_b32_e32 v2, s4
+; CHECK-NEXT: v_mov_b32_e32 v3, s5
+; CHECK-NEXT: v_mov_b32_e32 v4, s6
+; CHECK-NEXT: v_mov_b32_e32 v5, s7
+; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: s_inst_prefetch 0x1
+; CHECK-NEXT: .p2align 6
+; CHECK-NEXT: .LBB10_1: ; %static-memset-expansion-main-body
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: v_add_co_u32 v6, vcc_lo, v0, s4
+; CHECK-NEXT: s_add_u32 s4, s4, 0x100
+; CHECK-NEXT: v_add_co_ci_u32_e64 v7, null, s5, v1, vcc_lo
+; CHECK-NEXT: s_addc_u32 s5, s5, 0
+; CHECK-NEXT: v_add_co_u32 v8, vcc_lo, v6, 48
+; CHECK-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5]
+; CHECK-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v7, vcc_lo
+; CHECK-NEXT: flat_store_dwordx4 v[6:7], v[2:5] offset:128
+; CHECK-NEXT: flat_store_dwordx4 v[6:7], v[2:5] offset:64
+; CHECK-NEXT: flat_store_dwordx4 v[6:7], v[2:5] offset:32
+; CHECK-NEXT: flat_store_dwordx4 v[6:7], v[2:5] offset:16
+; CHECK-NEXT: flat_store_dwordx4 v[6:7], v[2:5]
+; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[2:5] offset:192
+; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[2:5] offset:176
+; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[2:5] offset:160
+; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[2:5] offset:144
+; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[2:5] offset:128
+; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[2:5] offset:112
+; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[2:5] offset:96
+; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[2:5] offset:64
+; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[2:5] offset:48
+; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[2:5] offset:32
+; CHECK-NEXT: flat_store_dwordx4 v[8:9], v[2:5]
+; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s6
+; CHECK-NEXT: s_cbranch_vccnz .LBB10_1
+; CHECK-NEXT: ; %bb.2: ; %static-memset-post-expansion
+; CHECK-NEXT: s_inst_prefetch 0x2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+;
+; ALIGNED-LABEL: memset_p0_sz2048:
+; ALIGNED: ; %bb.0: ; %entry
+; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ALIGNED-NEXT: v_mov_b32_e32 v4, 0x41414141
+; ALIGNED-NEXT: v_mov_b32_e32 v5, 0x41
+; ALIGNED-NEXT: s_mov_b64 s[4:5], 0
+; ALIGNED-NEXT: .LBB10_1: ; %static-memset-expansion-main-body
+; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1
+; ALIGNED-NEXT: v_add_co_u32 v2, vcc_lo, v0, s4
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v3, null, s5, v1, vcc_lo
+; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100
+; ALIGNED-NEXT: s_addc_u32 s5, s5, 0
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:204
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:192
+; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5]
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:128
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:12
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:8
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:4
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:64
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:108
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:104
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:100
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:96
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:32
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:60
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:56
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:52
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:48
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:16
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:76
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:72
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:68
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:64
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:8
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:4
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:2
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:1
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5
+; ALIGNED-NEXT: v_add_co_u32 v2, vcc_lo, v2, 3
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; ALIGNED-NEXT: s_and_b32 vcc_lo, exec_lo, s6
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:152
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:156
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:148
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:144
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:247
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:248
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:246
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:252
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:251
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:250
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:249
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:245
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:244
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:243
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:242
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:241
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:240
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:239
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:238
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:237
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:168
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:164
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:231
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:232
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:230
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:236
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:235
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:234
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:233
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:229
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:228
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:227
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:226
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:225
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:224
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:223
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:222
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:221
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:124
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:116
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:112
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:215
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:216
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:214
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:220
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:219
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:218
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:217
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:213
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:212
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:211
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:210
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:209
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:208
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:207
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:206
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:205
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:140
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:132
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:128
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:199
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:200
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:198
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:204
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:203
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:202
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:201
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:197
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:196
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:195
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:194
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:193
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:192
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:191
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:190
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:189
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:216
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:208
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:183
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:184
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:182
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:188
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:187
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:186
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:185
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:181
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:180
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:179
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:178
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:177
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:176
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:175
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:174
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:173
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:228
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:167
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:168
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:166
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:172
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:171
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:170
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:169
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:165
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:164
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:163
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:162
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:161
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:160
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:159
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:158
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:157
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:184
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:188
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:180
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:176
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:151
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:152
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:150
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:156
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:155
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:154
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:153
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:149
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:148
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:147
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:146
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:145
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:144
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:143
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:142
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:141
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:135
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:136
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:134
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:140
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:139
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:138
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:137
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:133
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:132
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:131
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:130
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:129
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:128
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:127
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:126
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:24
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:119
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:120
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:118
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:124
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:123
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:122
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:121
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:117
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:116
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:115
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:114
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:113
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:112
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:111
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:110
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:109
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:40
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:44
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:36
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:32
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:103
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:104
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:102
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:108
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:107
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:106
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:105
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:101
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:100
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:99
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:98
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:97
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:96
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:95
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:94
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:93
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:244
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:240
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:78
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:77
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:80
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:79
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:84
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:83
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:82
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:81
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:86
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:85
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:88
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:87
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:92
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:91
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:90
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:89
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:71
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:72
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:70
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:76
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:75
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:74
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:73
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:69
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:68
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:67
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:66
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:65
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:64
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:63
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:62
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:92
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:84
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:55
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:56
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:54
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:60
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:59
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:58
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:57
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:53
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:52
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:51
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:50
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:49
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:48
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:47
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:46
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:45
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:39
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:40
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:38
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:44
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:43
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:42
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:41
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:37
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:36
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:35
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:34
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:33
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:32
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:31
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:30
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:23
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:24
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:22
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:28
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:27
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:26
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:25
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:21
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:20
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:19
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:18
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:17
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:16
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:15
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:14
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:7
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:8
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:6
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:12
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:11
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:10
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:9
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:4
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:3
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5 offset:2
+; ALIGNED-NEXT: flat_store_byte v[2:3], v5
+; ALIGNED-NEXT: s_cbranch_vccnz .LBB10_1
+; ALIGNED-NEXT: ; %bb.2: ; %static-memset-post-expansion
+; ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
+; ALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; UNROLL3-LABEL: memset_p0_sz2048:
+; UNROLL3: ; %bb.0: ; %entry
+; UNROLL3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; UNROLL3-NEXT: s_mov_b32 s4, 0x41414141
+; UNROLL3-NEXT: s_mov_b32 s5, s4
+; UNROLL3-NEXT: s_mov_b32 s6, s4
+; UNROLL3-NEXT: s_mov_b32 s7, s4
+; UNROLL3-NEXT: v_mov_b32_e32 v2, s4
+; UNROLL3-NEXT: v_mov_b32_e32 v3, s5
+; UNROLL3-NEXT: v_mov_b32_e32 v4, s6
+; UNROLL3-NEXT: v_mov_b32_e32 v5, s7
+; UNROLL3-NEXT: s_mov_b64 s[4:5], 0
+; UNROLL3-NEXT: .p2align 6
+; UNROLL3-NEXT: .LBB10_1: ; %static-memset-expansion-main-body
+; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1
+; UNROLL3-NEXT: v_add_co_u32 v6, vcc_lo, v0, s4
+; UNROLL3-NEXT: s_add_u32 s4, s4, 48
+; UNROLL3-NEXT: v_add_co_ci_u32_e64 v7, null, s5, v1, vcc_lo
+; UNROLL3-NEXT: s_addc_u32 s5, s5, 0
+; UNROLL3-NEXT: flat_store_dwordx4 v[6:7], v[2:5] offset:16
+; UNROLL3-NEXT: flat_store_dwordx4 v[6:7], v[2:5]
+; UNROLL3-NEXT: v_cmp_gt_u64_e64 s6, 0x7e0, s[4:5]
+; UNROLL3-NEXT: flat_store_dwordx4 v[6:7], v[2:5] offset:32
+; UNROLL3-NEXT: s_and_b32 vcc_lo, exec_lo, s6
+; UNROLL3-NEXT: s_cbranch_vccnz .LBB10_1
+; UNROLL3-NEXT: ; %bb.2: ; %static-memset-post-expansion
+; UNROLL3-NEXT: s_mov_b32 s4, 0x41414141
+; UNROLL3-NEXT: s_mov_b32 s5, s4
+; UNROLL3-NEXT: s_mov_b32 s6, s4
+; UNROLL3-NEXT: s_mov_b32 s7, s4
+; UNROLL3-NEXT: v_mov_b32_e32 v2, s4
+; UNROLL3-NEXT: v_mov_b32_e32 v3, s5
+; UNROLL3-NEXT: v_mov_b32_e32 v4, s6
+; UNROLL3-NEXT: v_mov_b32_e32 v5, s7
+; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[2:5] offset:2016
+; UNROLL3-NEXT: flat_store_dwordx4 v[0:1], v[2:5] offset:2032
+; UNROLL3-NEXT: s_waitcnt lgkmcnt(0)
+; UNROLL3-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memset.p0.i64(ptr addrspace(0) noundef nonnull %dst, i8 65, i64 2048, i1 false)
+ ret void
+}
+
+define void @memset_p1_sz2048(ptr addrspace(1) %dst) {
+; CHECK-LABEL: memset_p1_sz2048:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_mov_b32 s4, 0x41414141
+; CHECK-NEXT: s_mov_b32 s5, s4
+; CHECK-NEXT: s_mov_b32 s6, s4
+; CHECK-NEXT: s_mov_b32 s7, s4
+; CHECK-NEXT: v_mov_b32_e32 v2, s4
+; CHECK-NEXT: v_mov_b32_e32 v3, s5
+; CHECK-NEXT: v_mov_b32_e32 v4, s6
+; CHECK-NEXT: v_mov_b32_e32 v5, s7
+; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: s_inst_prefetch 0x1
+; CHECK-NEXT: .p2align 6
+; CHECK-NEXT: .LBB11_1: ; %static-memset-expansion-main-body
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: v_add_co_u32 v6, vcc_lo, v0, s4
+; CHECK-NEXT: s_add_u32 s4, s4, 0x100
+; CHECK-NEXT: v_add_co_ci_u32_e64 v7, null, s5, v1, vcc_lo
+; CHECK-NEXT: s_addc_u32 s5, s5, 0
+; CHECK-NEXT: global_store_dwordx4 v[6:7], v[2:5], off offset:240
+; CHECK-NEXT: global_store_dwordx4 v[6:7], v[2:5], off offset:224
+; CHECK-NEXT: global_store_dwordx4 v[6:7], v[2:5], off offset:208
+; CHECK-NEXT: global_store_dwordx4 v[6:7], v[2:5], off offset:192
+; CHECK-NEXT: global_store_dwordx4 v[6:7], v[2:5], off offset:176
+; CHECK-NEXT: global_store_dwordx4 v[6:7], v[2:5], off offset:160
+; CHECK-NEXT: global_store_dwordx4 v[6:7], v[2:5], off offset:144
+; CHECK-NEXT: global_store_dwordx4 v[6:7], v[2:5], off offset:128
+; CHECK-NEXT: global_store_dwordx4 v[6:7], v[2:5], off offset:112
+; CHECK-NEXT: global_store_dwordx4 v[6:7], v[2:5], off offset:96
+; CHECK-NEXT: global_store_dwordx4 v[6:7], v[2:5], off offset:80
+; CHECK-NEXT: global_store_dwordx4 v[6:7], v[2:5], off offset:64
+; CHECK-NEXT: global_store_dwordx4 v[6:7], v[2:5], off offset:48
+; CHECK-NEXT: global_store_dwordx4 v[6:7], v[2:5], off offset:32
+; CHECK-NEXT: global_store_dwordx4 v[6:7], v[2:5], off offset:16
+; CHECK-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5]
+; CHECK-NEXT: global_store_dwordx4 v[6:7], v[2:5], off
+; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s6
+; CHECK-NEXT: s_cbranch_vccnz .LBB11_1
+; CHECK-NEXT: ; %bb.2: ; %static-memset-post-expansion
+; CHECK-NEXT: s_inst_prefetch 0x2
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+;
+; ALIGNED-LABEL: memset_p1_sz2048:
+; ALIGNED: ; %bb.0: ; %entry
+; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ALIGNED-NEXT: v_mov_b32_e32 v4, 0x41414141
+; ALIGNED-NEXT: v_mov_b32_e32 v5, 0x41
+; ALIGNED-NEXT: s_mov_b64 s[4:5], 0
+; ALIGNED-NEXT: .LBB11_1: ; %static-memset-expansion-main-body
+; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1
+; ALIGNED-NEXT: v_add_co_u32 v2, vcc_lo, v0, s4
+; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100
+; ALIGNED-NEXT: v_add_co_ci_u32_e64 v3, null, s5, v1, vcc_lo
+; ALIGNED-NEXT: s_addc_u32 s5, s5, 0
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:152
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:156
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:148
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:144
+; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5]
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:250
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:251
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:249
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:255
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:254
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:253
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:252
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:248
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:247
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:246
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:245
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:244
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:243
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:242
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:241
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:240
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:168
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:172
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:164
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:160
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:234
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:235
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:233
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:239
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:238
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:237
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:236
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:232
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:231
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:230
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:229
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:228
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:227
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:226
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:225
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:224
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:120
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:124
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:116
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:112
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:218
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:219
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:217
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:223
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:222
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:221
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:220
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:216
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:215
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:214
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:213
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:212
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:211
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:210
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:209
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:208
+; ALIGNED-NEXT: s_and_b32 vcc_lo, exec_lo, s6
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:140
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:132
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:128
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:202
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:203
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:201
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:207
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:206
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:205
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:204
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:200
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:199
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:198
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:197
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:196
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:195
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:194
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:193
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:192
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:216
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:220
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:212
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:208
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:186
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:187
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:185
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:191
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:190
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:189
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:188
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:184
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:183
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:182
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:181
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:180
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:179
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:178
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:177
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:176
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:232
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:236
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:228
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:224
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:170
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:171
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:169
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:175
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:174
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:173
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:172
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:168
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:167
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:166
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:165
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:164
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:163
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:162
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:161
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:160
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:184
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:188
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:180
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:176
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:154
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:155
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:153
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:159
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:158
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:157
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:156
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:152
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:151
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:150
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:149
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:148
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:147
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:146
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:145
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:144
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:204
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:192
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:138
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:139
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:137
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:143
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:142
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:141
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:140
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:136
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:135
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:134
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:133
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:132
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:131
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:130
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:129
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:128
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:24
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:122
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:123
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:121
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:127
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:126
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:125
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:124
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:120
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:119
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:118
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:117
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:116
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:115
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:114
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:113
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:112
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:40
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:44
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:36
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:32
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:106
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:107
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:105
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:111
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:110
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:109
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:108
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:104
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:103
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:102
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:101
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:100
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:99
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:98
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:97
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:96
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:252
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:244
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:240
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:81
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:80
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:83
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:82
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:87
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:86
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:85
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:84
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:89
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:88
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:91
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:90
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:95
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:94
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:93
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:92
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:8
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:12
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:4
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:74
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:75
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:73
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:79
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:78
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:77
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:76
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:72
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:71
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:70
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:69
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:68
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:67
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:66
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:65
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:64
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:92
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:84
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:58
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:59
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:57
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:63
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:62
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:61
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:60
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:56
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:55
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:54
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:53
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:52
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:51
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:50
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:49
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:48
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:104
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:108
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:100
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:96
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:42
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:43
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:41
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:47
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:46
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:45
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:44
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:40
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:39
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:38
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:37
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:36
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:35
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:34
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:33
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:32
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:56
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:60
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:52
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:48
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:26
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:27
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:25
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:31
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:30
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:29
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:28
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:24
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:23
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:22
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:21
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:20
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:19
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:18
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:17
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:16
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:72
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:76
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:68
+; ALIGNED-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:64
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:10
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:11
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:9
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:15
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:14
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:13
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:12
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:8
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:7
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:6
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:5
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:4
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:3
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:2
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off offset:1
+; ALIGNED-NEXT: global_store_byte v[2:3], v5, off
+; ALIGNED-NEXT: s_cbranch_vccnz .LBB11_1
+; ALIGNED-NEXT: ; %bb.2: ; %static-memset-post-expansion
+; ALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; UNROLL3-LABEL: memset_p1_sz2048:
+; UNROLL3: ; %bb.0: ; %entry
+; UNROLL3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; UNROLL3-NEXT: s_mov_b32 s4, 0x41414141
+; UNROLL3-NEXT: s_mov_b32 s5, s4
+; UNROLL3-NEXT: s_mov_b32 s6, s4
+; UNROLL3-NEXT: s_mov_b32 s7, s4
+; UNROLL3-NEXT: v_mov_b32_e32 v2, s4
+; UNROLL3-NEXT: v_mov_b32_e32 v3, s5
+; UNROLL3-NEXT: v_mov_b32_e32 v4, s6
+; UNROLL3-NEXT: v_mov_b32_e32 v5, s7
+; UNROLL3-NEXT: s_mov_b64 s[4:5], 0
+; UNROLL3-NEXT: .p2align 6
+; UNROLL3-NEXT: .LBB11_1: ; %static-memset-expansion-main-body
+; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1
+; UNROLL3-NEXT: v_add_co_u32 v6, vcc_lo, v0, s4
+; UNROLL3-NEXT: s_add_u32 s4, s4, 48
+; UNROLL3-NEXT: v_add_co_ci_u32_e64 v7, null, s5, v1, vcc_lo
+; UNROLL3-NEXT: s_addc_u32 s5, s5, 0
+; UNROLL3-NEXT: global_store_dwordx4 v[6:7], v[2:5], off offset:16
+; UNROLL3-NEXT: global_store_dwordx4 v[6:7], v[2:5], off
+; UNROLL3-NEXT: v_cmp_gt_u64_e64 s6, 0x7e0, s[4:5]
+; UNROLL3-NEXT: global_store_dwordx4 v[6:7], v[2:5], off offset:32
+; UNROLL3-NEXT: s_and_b32 vcc_lo, exec_lo, s6
+; UNROLL3-NEXT: s_cbranch_vccnz .LBB11_1
+; UNROLL3-NEXT: ; %bb.2: ; %static-memset-post-expansion
+; UNROLL3-NEXT: s_mov_b32 s4, 0x41414141
+; UNROLL3-NEXT: s_mov_b32 s5, s4
+; UNROLL3-NEXT: s_mov_b32 s6, s4
+; UNROLL3-NEXT: s_mov_b32 s7, s4
+; UNROLL3-NEXT: v_mov_b32_e32 v2, s4
+; UNROLL3-NEXT: v_mov_b32_e32 v3, s5
+; UNROLL3-NEXT: v_mov_b32_e32 v4, s6
+; UNROLL3-NEXT: v_mov_b32_e32 v5, s7
+; UNROLL3-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:2016
+; UNROLL3-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:2032
+; UNROLL3-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memset.p1.i64(ptr addrspace(1) noundef nonnull %dst, i8 65, i64 2048, i1 false)
+ ret void
+}
+
+define void @memset_p3_sz2048(ptr addrspace(3) %dst) {
+; CHECK-LABEL: memset_p3_sz2048:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_mov_b32 s4, 0x41414141
+; CHECK-NEXT: s_mov_b32 s5, s4
+; CHECK-NEXT: s_mov_b32 s6, s4
+; CHECK-NEXT: s_mov_b32 s7, s4
+; CHECK-NEXT: v_mov_b32_e32 v1, s4
+; CHECK-NEXT: v_mov_b32_e32 v2, s5
+; CHECK-NEXT: v_mov_b32_e32 v3, s6
+; CHECK-NEXT: v_mov_b32_e32 v4, s7
+; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: s_inst_prefetch 0x1
+; CHECK-NEXT: .p2align 6
+; CHECK-NEXT: .LBB12_1: ; %static-memset-expansion-main-body
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_add_u32 s4, s4, 0x100
+; CHECK-NEXT: s_addc_u32 s5, s5, 0
+; CHECK-NEXT: ds_write_b128 v0, v[1:4] offset:240
+; CHECK-NEXT: ds_write_b128 v0, v[1:4] offset:224
+; CHECK-NEXT: ds_write_b128 v0, v[1:4] offset:208
+; CHECK-NEXT: ds_write_b128 v0, v[1:4] offset:192
+; CHECK-NEXT: ds_write_b128 v0, v[1:4] offset:176
+; CHECK-NEXT: ds_write_b128 v0, v[1:4] offset:160
+; CHECK-NEXT: ds_write_b128 v0, v[1:4] offset:144
+; CHECK-NEXT: ds_write_b128 v0, v[1:4] offset:128
+; CHECK-NEXT: ds_write_b128 v0, v[1:4] offset:112
+; CHECK-NEXT: ds_write_b128 v0, v[1:4] offset:96
+; CHECK-NEXT: ds_write_b128 v0, v[1:4] offset:80
+; CHECK-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5]
+; CHECK-NEXT: ds_write_b128 v0, v[1:4] offset:64
+; CHECK-NEXT: ds_write_b128 v0, v[1:4] offset:48
+; CHECK-NEXT: ds_write_b128 v0, v[1:4] offset:32
+; CHECK-NEXT: ds_write_b128 v0, v[1:4] offset:16
+; CHECK-NEXT: ds_write_b128 v0, v[1:4]
+; CHECK-NEXT: v_add_nc_u32_e32 v0, 0x100, v0
+; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s6
+; CHECK-NEXT: s_cbranch_vccnz .LBB12_1
+; CHECK-NEXT: ; %bb.2: ; %static-memset-post-expansion
+; CHECK-NEXT: s_inst_prefetch 0x2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+;
+; ALIGNED-LABEL: memset_p3_sz2048:
+; ALIGNED: ; %bb.0: ; %entry
+; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ALIGNED-NEXT: v_mov_b32_e32 v1, 0x41
+; ALIGNED-NEXT: s_mov_b64 s[4:5], 0
+; ALIGNED-NEXT: .LBB12_1: ; %static-memset-expansion-main-body
+; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1
+; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100
+; ALIGNED-NEXT: s_addc_u32 s5, s5, 0
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:255
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:254
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:253
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:252
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:251
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:250
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:249
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:248
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:247
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:246
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:245
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:244
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:243
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:242
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:241
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:240
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:239
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:238
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:237
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:236
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:235
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:234
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:233
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:232
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:231
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:230
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:229
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:228
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:227
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:226
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:225
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:224
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:223
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:222
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:221
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:220
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:219
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:218
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:217
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:216
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:215
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:214
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:213
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:212
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:211
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:210
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:209
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:208
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:207
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:206
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:205
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:204
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:203
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:202
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:201
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:200
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:199
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:198
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:197
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:196
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:195
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:194
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:193
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:192
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:191
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:190
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:189
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:188
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:187
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:186
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:185
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:184
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:183
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:182
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:181
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:180
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:179
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:178
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:177
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:176
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:175
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:174
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:173
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:172
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:171
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:170
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:169
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:168
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:167
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:166
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:165
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:164
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:163
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:162
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:161
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:160
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:159
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:158
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:157
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:156
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:155
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:154
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:153
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:152
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:151
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:150
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:149
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:148
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:147
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:146
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:145
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:144
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:143
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:142
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:141
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:140
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:139
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:138
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:137
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:136
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:135
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:134
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:133
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:132
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:131
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:130
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:129
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:128
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:127
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:126
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:125
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:124
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:123
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:122
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:121
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:120
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:119
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:118
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:117
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:116
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:115
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:114
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:113
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:112
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:111
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:110
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:109
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:108
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:107
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:106
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:105
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:104
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:103
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:102
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:101
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:100
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:99
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:98
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:97
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:96
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:87
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:86
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:85
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:84
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:81
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:80
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:83
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:82
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:95
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:94
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:93
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:92
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:89
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:88
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:91
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:90
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:79
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:78
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:77
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:76
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:75
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:74
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:73
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:72
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:71
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:70
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:69
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:68
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:67
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:66
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:65
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:64
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:63
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:62
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:61
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:60
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:59
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:58
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:57
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:56
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:55
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:54
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:53
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:52
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:51
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:50
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:49
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:48
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:47
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:46
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:45
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:44
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:43
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:42
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:41
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:40
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:39
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:38
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:37
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:36
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:35
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:34
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:33
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:32
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:31
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:30
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:29
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:28
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:27
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:26
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:25
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:24
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:23
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:22
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:21
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:20
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:19
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:18
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:17
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:16
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:15
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:14
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:13
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:12
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:11
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:10
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:9
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:8
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:7
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:6
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:5
+; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5]
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:4
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:3
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:2
+; ALIGNED-NEXT: ds_write_b8 v0, v1 offset:1
+; ALIGNED-NEXT: ds_write_b8 v0, v1
+; ALIGNED-NEXT: v_add_nc_u32_e32 v0, 0x100, v0
+; ALIGNED-NEXT: s_and_b32 vcc_lo, exec_lo, s6
+; ALIGNED-NEXT: s_cbranch_vccnz .LBB12_1
+; ALIGNED-NEXT: ; %bb.2: ; %static-memset-post-expansion
+; ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
+; ALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; UNROLL3-LABEL: memset_p3_sz2048:
+; UNROLL3: ; %bb.0: ; %entry
+; UNROLL3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; UNROLL3-NEXT: s_mov_b32 s4, 0x41414141
+; UNROLL3-NEXT: v_mov_b32_e32 v5, v0
+; UNROLL3-NEXT: s_mov_b32 s5, s4
+; UNROLL3-NEXT: s_mov_b32 s6, s4
+; UNROLL3-NEXT: s_mov_b32 s7, s4
+; UNROLL3-NEXT: v_mov_b32_e32 v1, s4
+; UNROLL3-NEXT: v_mov_b32_e32 v2, s5
+; UNROLL3-NEXT: v_mov_b32_e32 v3, s6
+; UNROLL3-NEXT: v_mov_b32_e32 v4, s7
+; UNROLL3-NEXT: s_mov_b64 s[4:5], 0
+; UNROLL3-NEXT: .LBB12_1: ; %static-memset-expansion-main-body
+; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1
+; UNROLL3-NEXT: s_add_u32 s4, s4, 48
+; UNROLL3-NEXT: s_addc_u32 s5, s5, 0
+; UNROLL3-NEXT: ds_write_b128 v5, v[1:4] offset:16
+; UNROLL3-NEXT: ds_write_b128 v5, v[1:4]
+; UNROLL3-NEXT: ds_write_b128 v5, v[1:4] offset:32
+; UNROLL3-NEXT: v_cmp_gt_u64_e64 s6, 0x7e0, s[4:5]
+; UNROLL3-NEXT: v_add_nc_u32_e32 v5, 48, v5
+; UNROLL3-NEXT: s_and_b32 vcc_lo, exec_lo, s6
+; UNROLL3-NEXT: s_cbranch_vccnz .LBB12_1
+; UNROLL3-NEXT: ; %bb.2: ; %static-memset-post-expansion
+; UNROLL3-NEXT: s_mov_b32 s4, 0x41414141
+; UNROLL3-NEXT: s_mov_b32 s5, s4
+; UNROLL3-NEXT: s_mov_b32 s6, s4
+; UNROLL3-NEXT: s_mov_b32 s7, s4
+; UNROLL3-NEXT: v_mov_b32_e32 v1, s4
+; UNROLL3-NEXT: v_mov_b32_e32 v2, s5
+; UNROLL3-NEXT: v_mov_b32_e32 v3, s6
+; UNROLL3-NEXT: v_mov_b32_e32 v4, s7
+; UNROLL3-NEXT: ds_write_b128 v0, v[1:4] offset:2016
+; UNROLL3-NEXT: ds_write_b128 v0, v[1:4] offset:2032
+; UNROLL3-NEXT: s_waitcnt lgkmcnt(0)
+; UNROLL3-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memset.p3.i64(ptr addrspace(3) noundef nonnull %dst, i8 65, i64 2048, i1 false)
+ ret void
+}
+
+define void @memset_p5_sz2048(ptr addrspace(5) %dst) {
+; CHECK-LABEL: memset_p5_sz2048:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v1, 0x41414141
+; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: .LBB13_1: ; %static-memset-expansion-main-body
+; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: s_add_u32 s4, s4, 0x100
+; CHECK-NEXT: s_addc_u32 s5, s5, 0
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:252
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:248
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:244
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:240
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:236
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:232
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:228
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:224
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:220
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:216
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:212
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:208
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:204
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:200
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:196
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:192
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:188
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:184
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:180
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:176
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:172
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:168
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:164
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:160
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:156
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:152
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:148
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:144
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:140
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:136
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:132
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:128
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:124
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:120
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:116
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:112
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:108
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:104
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:100
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:96
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:92
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:88
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:84
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:80
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:76
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:72
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:68
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:64
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:60
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:56
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:52
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:48
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:44
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:40
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:36
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:32
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:24
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:20
+; CHECK-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5]
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:16
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; CHECK-NEXT: v_add_nc_u32_e32 v0, 0x100, v0
+; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s6
+; CHECK-NEXT: s_cbranch_vccnz .LBB13_1
+; CHECK-NEXT: ; %bb.2: ; %static-memset-post-expansion
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+;
+; ALIGNED-LABEL: memset_p5_sz2048:
+; ALIGNED: ; %bb.0: ; %entry
+; ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ALIGNED-NEXT: v_mov_b32_e32 v1, 0x41
+; ALIGNED-NEXT: s_mov_b64 s[4:5], 0
+; ALIGNED-NEXT: .LBB13_1: ; %static-memset-expansion-main-body
+; ALIGNED-NEXT: ; =>This Inner Loop Header: Depth=1
+; ALIGNED-NEXT: s_add_u32 s4, s4, 0x100
+; ALIGNED-NEXT: s_addc_u32 s5, s5, 0
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:255
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:254
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:253
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:252
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:251
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:250
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:249
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:248
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:247
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:246
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:245
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:244
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:243
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:242
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:241
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:240
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:239
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:238
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:237
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:236
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:235
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:234
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:233
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:232
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:231
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:230
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:229
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:228
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:227
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:226
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:225
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:224
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:223
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:222
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:221
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:220
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:219
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:218
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:217
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:216
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:215
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:214
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:213
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:212
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:211
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:210
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:209
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:208
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:207
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:206
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:205
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:204
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:203
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:202
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:201
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:200
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:199
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:198
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:197
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:196
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:195
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:194
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:193
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:192
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:191
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:190
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:189
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:188
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:187
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:186
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:185
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:184
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:183
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:182
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:181
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:180
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:179
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:178
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:177
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:176
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:175
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:174
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:173
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:172
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:171
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:170
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:169
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:168
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:167
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:166
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:165
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:164
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:163
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:162
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:161
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:160
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:159
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:158
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:157
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:156
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:155
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:154
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:153
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:152
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:151
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:150
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:149
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:148
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:147
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:146
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:145
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:144
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:143
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:142
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:141
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:140
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:139
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:138
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:137
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:136
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:135
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:134
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:133
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:132
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:131
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:130
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:129
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:128
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:127
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:126
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:125
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:124
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:123
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:122
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:121
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:120
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:119
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:118
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:117
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:116
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:115
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:114
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:113
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:112
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:111
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:110
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:109
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:108
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:107
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:106
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:105
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:104
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:103
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:102
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:101
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:100
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:99
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:98
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:97
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:96
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:95
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:94
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:93
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:92
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:91
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:90
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:89
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:88
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:87
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:86
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:85
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:84
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:83
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:82
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:81
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:80
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:79
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:78
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:77
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:76
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:75
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:74
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:73
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:72
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:71
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:70
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:69
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:68
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:67
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:66
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:65
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:64
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:63
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:62
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:61
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:60
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:59
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:58
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:57
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:56
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:55
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:54
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:53
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:52
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:51
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:50
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:49
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:48
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:47
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:46
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:45
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:44
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:43
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:42
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:41
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:40
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:39
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:38
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:37
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:36
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:35
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:34
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:33
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:32
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:31
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:30
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:29
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:28
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:27
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:26
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:25
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:24
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:23
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:22
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:21
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:20
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:19
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:18
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:17
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:16
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:15
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:14
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:13
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:12
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:11
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:10
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:9
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:8
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:7
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:6
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:5
+; ALIGNED-NEXT: v_cmp_gt_u64_e64 s6, 0x800, s[4:5]
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:4
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:3
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:2
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:1
+; ALIGNED-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen
+; ALIGNED-NEXT: v_add_nc_u32_e32 v0, 0x100, v0
+; ALIGNED-NEXT: s_and_b32 vcc_lo, exec_lo, s6
+; ALIGNED-NEXT: s_cbranch_vccnz .LBB13_1
+; ALIGNED-NEXT: ; %bb.2: ; %static-memset-post-expansion
+; ALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; UNROLL3-LABEL: memset_p5_sz2048:
+; UNROLL3: ; %bb.0: ; %entry
+; UNROLL3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; UNROLL3-NEXT: v_mov_b32_e32 v1, 0x41414141
+; UNROLL3-NEXT: v_mov_b32_e32 v2, v0
+; UNROLL3-NEXT: s_mov_b64 s[4:5], 0
+; UNROLL3-NEXT: .p2align 6
+; UNROLL3-NEXT: .LBB13_1: ; %static-memset-expansion-main-body
+; UNROLL3-NEXT: ; =>This Inner Loop Header: Depth=1
+; UNROLL3-NEXT: s_add_u32 s4, s4, 48
+; UNROLL3-NEXT: s_addc_u32 s5, s5, 0
+; UNROLL3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:44
+; UNROLL3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:40
+; UNROLL3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:36
+; UNROLL3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:32
+; UNROLL3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:28
+; UNROLL3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:24
+; UNROLL3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:20
+; UNROLL3-NEXT: v_cmp_gt_u64_e64 s6, 0x7e0, s[4:5]
+; UNROLL3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:16
+; UNROLL3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:12
+; UNROLL3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:8
+; UNROLL3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4
+; UNROLL3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; UNROLL3-NEXT: v_add_nc_u32_e32 v2, 48, v2
+; UNROLL3-NEXT: s_and_b32 vcc_lo, exec_lo, s6
+; UNROLL3-NEXT: s_cbranch_vccnz .LBB13_1
+; UNROLL3-NEXT: ; %bb.2: ; %static-memset-post-expansion
+; UNROLL3-NEXT: v_mov_b32_e32 v1, 0x41414141
+; UNROLL3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:2028
+; UNROLL3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:2024
+; UNROLL3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:2020
+; UNROLL3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:2016
+; UNROLL3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:2044
+; UNROLL3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:2040
+; UNROLL3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:2036
+; UNROLL3-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:2032
+; UNROLL3-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memset.p5.i64(ptr addrspace(5) noundef nonnull %dst, i8 65, i64 2048, i1 false)
+ ret void
+}
+
declare void @llvm.memcpy.p0.p0.i64(ptr addrspace(0) noalias nocapture writeonly, ptr addrspace(0) noalias nocapture readonly, i64, i1 immarg) #2
declare void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noalias nocapture writeonly, ptr addrspace(1) noalias nocapture readonly, i64, i1 immarg) #2
@@ -15989,4 +17599,10 @@ declare void @llvm.memmove.p5.p5.i64(ptr addrspace(5) nocapture writeonly, ptr a
declare void @llvm.memmove.p0.p5.i64(ptr addrspace(0) nocapture writeonly, ptr addrspace(5) nocapture readonly, i64, i1 immarg) #2
+declare void @llvm.memset.p0.i64(ptr addrspace(0) nocapture writeonly, i8, i64, i1 immarg) #3
+declare void @llvm.memset.p1.i64(ptr addrspace(1) nocapture writeonly, i8, i64, i1 immarg) #3
+declare void @llvm.memset.p3.i64(ptr addrspace(3) nocapture writeonly, i8, i64, i1 immarg) #3
+declare void @llvm.memset.p5.i64(ptr addrspace(5) nocapture writeonly, i8, i64, i1 immarg) #3
+
attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
+attributes #3 = { nocallback nofree nounwind willreturn memory(argmem: write) }
diff --git a/llvm/test/CodeGen/AMDGPU/memset-param-combinations.ll b/llvm/test/CodeGen/AMDGPU/memset-param-combinations.ll
new file mode 100644
index 0000000000000..512c0477803fb
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/memset-param-combinations.ll
@@ -0,0 +1,1900 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 %s -o - | FileCheck -check-prefixes=GFX942,GFX942-SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 %s -o - | FileCheck -check-prefixes=GFX942,GFX942-GISEL %s
+
+
+define void @memset_p0_varsize_align_4_varsetval(ptr addrspace(0) align 4 %dst, i8 %setval, i64 %size) {
+; GFX942-SDAG-LABEL: memset_p0_varsize_align_4_varsetval:
+; GFX942-SDAG: ; %bb.0: ; %entry
+; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v10, v3
+; GFX942-SDAG-NEXT: v_and_b32_e32 v12, -16, v10
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v13, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v11, v4
+; GFX942-SDAG-NEXT: v_and_b32_e32 v8, 15, v10
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v9, 0
+; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0
+; GFX942-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13]
+; GFX942-SDAG-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX942-SDAG-NEXT: s_cbranch_execz .LBB0_3
+; GFX942-SDAG-NEXT: ; %bb.1: ; %dynamic-memset-expansion-main-body.preheader
+; GFX942-SDAG-NEXT: s_mov_b32 s4, 0x4040404
+; GFX942-SDAG-NEXT: v_perm_b32 v4, v2, v2, s4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-SDAG-NEXT: s_mov_b64 s[4:5], 0
+; GFX942-SDAG-NEXT: .LBB0_2: ; %dynamic-memset-expansion-main-body
+; GFX942-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-SDAG-NEXT: v_lshl_add_u64 v[14:15], v[0:1], 0, s[4:5]
+; GFX942-SDAG-NEXT: s_add_u32 s4, s4, 16
+; GFX942-SDAG-NEXT: s_addc_u32 s5, s5, 0
+; GFX942-SDAG-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[12:13]
+; GFX942-SDAG-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT: flat_store_dwordx4 v[14:15], v[4:7]
+; GFX942-SDAG-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT: s_cbranch_execnz .LBB0_2
+; GFX942-SDAG-NEXT: .LBB0_3: ; %Flow4
+; GFX942-SDAG-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0
+; GFX942-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
+; GFX942-SDAG-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX942-SDAG-NEXT: s_cbranch_execz .LBB0_6
+; GFX942-SDAG-NEXT: ; %bb.4: ; %dynamic-memset-expansion-residual-body.preheader
+; GFX942-SDAG-NEXT: v_and_b32_e32 v10, -16, v10
+; GFX942-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[10:11]
+; GFX942-SDAG-NEXT: s_mov_b64 s[4:5], 0
+; GFX942-SDAG-NEXT: .LBB0_5: ; %dynamic-memset-expansion-residual-body
+; GFX942-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[4:5]
+; GFX942-SDAG-NEXT: s_add_u32 s4, s4, 1
+; GFX942-SDAG-NEXT: s_addc_u32 s5, s5, 0
+; GFX942-SDAG-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[8:9]
+; GFX942-SDAG-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT: flat_store_byte v[4:5], v2
+; GFX942-SDAG-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT: s_cbranch_execnz .LBB0_5
+; GFX942-SDAG-NEXT: .LBB0_6: ; %Flow2
+; GFX942-SDAG-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: memset_p0_varsize_align_4_varsetval:
+; GFX942-GISEL: ; %bb.0: ; %entry
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v10, v3
+; GFX942-GISEL-NEXT: v_and_b32_e32 v8, 15, v10
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v11, v4
+; GFX942-GISEL-NEXT: v_sub_co_u32_e32 v12, vcc, v10, v8
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v9, 0
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: v_subbrev_co_u32_e32 v13, vcc, 0, v11, vcc
+; GFX942-GISEL-NEXT: s_mov_b64 s[0:1], 0
+; GFX942-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13]
+; GFX942-GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX942-GISEL-NEXT: s_cbranch_execz .LBB0_3
+; GFX942-GISEL-NEXT: ; %bb.1: ; %dynamic-memset-expansion-main-body.preheader
+; GFX942-GISEL-NEXT: v_and_b32_e32 v3, 0xff, v2
+; GFX942-GISEL-NEXT: v_lshl_or_b32 v4, v3, 8, v3
+; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GFX942-GISEL-NEXT: v_or3_b32 v4, v4, v5, v3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v5, v4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[0:1]
+; GFX942-GISEL-NEXT: .LBB0_2: ; %dynamic-memset-expansion-main-body
+; GFX942-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, v0, v14
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v17, vcc, v1, v15, vcc
+; GFX942-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 16, v14
+; GFX942-GISEL-NEXT: flat_store_dwordx4 v[16:17], v[4:7]
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc
+; GFX942-GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[14:15], v[12:13]
+; GFX942-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX942-GISEL-NEXT: s_cbranch_execnz .LBB0_2
+; GFX942-GISEL-NEXT: .LBB0_3: ; %Flow4
+; GFX942-GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX942-GISEL-NEXT: s_mov_b64 s[0:1], 0
+; GFX942-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
+; GFX942-GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX942-GISEL-NEXT: s_cbranch_execz .LBB0_6
+; GFX942-GISEL-NEXT: ; %bb.4: ; %dynamic-memset-expansion-residual-body.preheader
+; GFX942-GISEL-NEXT: v_lshrrev_b64 v[4:5], 4, v[10:11]
+; GFX942-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 4, v[0:1]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-GISEL-NEXT: .LBB0_5: ; %dynamic-memset-expansion-residual-body
+; GFX942-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, v0, v4
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v7, vcc, v1, v5, vcc
+; GFX942-GISEL-NEXT: v_add_co_u32_e32 v4, vcc, 1, v4
+; GFX942-GISEL-NEXT: flat_store_byte v[6:7], v2
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
+; GFX942-GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[4:5], v[8:9]
+; GFX942-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX942-GISEL-NEXT: s_cbranch_execnz .LBB0_5
+; GFX942-GISEL-NEXT: .LBB0_6: ; %Flow2
+; GFX942-GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memset.p0.i64(ptr addrspace(0) noundef nonnull align 4 %dst, i8 %setval, i64 %size, i1 false)
+ ret void
+}
+
+define void @memset_p1_varsize_align_4_varsetval(ptr addrspace(1) align 4 %dst, i8 %setval, i64 %size) {
+; GFX942-SDAG-LABEL: memset_p1_varsize_align_4_varsetval:
+; GFX942-SDAG: ; %bb.0: ; %entry
+; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v10, v3
+; GFX942-SDAG-NEXT: v_and_b32_e32 v12, -16, v10
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v13, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v11, v4
+; GFX942-SDAG-NEXT: v_and_b32_e32 v8, 15, v10
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v9, 0
+; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0
+; GFX942-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13]
+; GFX942-SDAG-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX942-SDAG-NEXT: s_cbranch_execz .LBB1_3
+; GFX942-SDAG-NEXT: ; %bb.1: ; %dynamic-memset-expansion-main-body.preheader
+; GFX942-SDAG-NEXT: s_mov_b32 s4, 0x4040404
+; GFX942-SDAG-NEXT: v_perm_b32 v4, v2, v2, s4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-SDAG-NEXT: s_mov_b64 s[4:5], 0
+; GFX942-SDAG-NEXT: .LBB1_2: ; %dynamic-memset-expansion-main-body
+; GFX942-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-SDAG-NEXT: v_lshl_add_u64 v[14:15], v[0:1], 0, s[4:5]
+; GFX942-SDAG-NEXT: s_add_u32 s4, s4, 16
+; GFX942-SDAG-NEXT: s_addc_u32 s5, s5, 0
+; GFX942-SDAG-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[12:13]
+; GFX942-SDAG-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT: global_store_dwordx4 v[14:15], v[4:7], off
+; GFX942-SDAG-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT: s_cbranch_execnz .LBB1_2
+; GFX942-SDAG-NEXT: .LBB1_3: ; %Flow4
+; GFX942-SDAG-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0
+; GFX942-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
+; GFX942-SDAG-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX942-SDAG-NEXT: s_cbranch_execz .LBB1_6
+; GFX942-SDAG-NEXT: ; %bb.4: ; %dynamic-memset-expansion-residual-body.preheader
+; GFX942-SDAG-NEXT: v_and_b32_e32 v10, -16, v10
+; GFX942-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[10:11]
+; GFX942-SDAG-NEXT: s_mov_b64 s[4:5], 0
+; GFX942-SDAG-NEXT: .LBB1_5: ; %dynamic-memset-expansion-residual-body
+; GFX942-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[4:5]
+; GFX942-SDAG-NEXT: s_add_u32 s4, s4, 1
+; GFX942-SDAG-NEXT: s_addc_u32 s5, s5, 0
+; GFX942-SDAG-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[8:9]
+; GFX942-SDAG-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT: global_store_byte v[4:5], v2, off
+; GFX942-SDAG-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT: s_cbranch_execnz .LBB1_5
+; GFX942-SDAG-NEXT: .LBB1_6: ; %Flow2
+; GFX942-SDAG-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: memset_p1_varsize_align_4_varsetval:
+; GFX942-GISEL: ; %bb.0: ; %entry
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v10, v3
+; GFX942-GISEL-NEXT: v_and_b32_e32 v8, 15, v10
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v11, v4
+; GFX942-GISEL-NEXT: v_sub_co_u32_e32 v12, vcc, v10, v8
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v9, 0
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: v_subbrev_co_u32_e32 v13, vcc, 0, v11, vcc
+; GFX942-GISEL-NEXT: s_mov_b64 s[0:1], 0
+; GFX942-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13]
+; GFX942-GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX942-GISEL-NEXT: s_cbranch_execz .LBB1_3
+; GFX942-GISEL-NEXT: ; %bb.1: ; %dynamic-memset-expansion-main-body.preheader
+; GFX942-GISEL-NEXT: v_and_b32_e32 v3, 0xff, v2
+; GFX942-GISEL-NEXT: v_lshl_or_b32 v4, v3, 8, v3
+; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GFX942-GISEL-NEXT: v_or3_b32 v4, v4, v5, v3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v5, v4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[0:1]
+; GFX942-GISEL-NEXT: .LBB1_2: ; %dynamic-memset-expansion-main-body
+; GFX942-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, v0, v14
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v17, vcc, v1, v15, vcc
+; GFX942-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 16, v14
+; GFX942-GISEL-NEXT: global_store_dwordx4 v[16:17], v[4:7], off
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc
+; GFX942-GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[14:15], v[12:13]
+; GFX942-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX942-GISEL-NEXT: s_cbranch_execnz .LBB1_2
+; GFX942-GISEL-NEXT: .LBB1_3: ; %Flow4
+; GFX942-GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX942-GISEL-NEXT: s_mov_b64 s[0:1], 0
+; GFX942-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
+; GFX942-GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX942-GISEL-NEXT: s_cbranch_execz .LBB1_6
+; GFX942-GISEL-NEXT: ; %bb.4: ; %dynamic-memset-expansion-residual-body.preheader
+; GFX942-GISEL-NEXT: v_lshrrev_b64 v[4:5], 4, v[10:11]
+; GFX942-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 4, v[0:1]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-GISEL-NEXT: .LBB1_5: ; %dynamic-memset-expansion-residual-body
+; GFX942-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, v0, v4
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v7, vcc, v1, v5, vcc
+; GFX942-GISEL-NEXT: v_add_co_u32_e32 v4, vcc, 1, v4
+; GFX942-GISEL-NEXT: global_store_byte v[6:7], v2, off
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
+; GFX942-GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[4:5], v[8:9]
+; GFX942-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX942-GISEL-NEXT: s_cbranch_execnz .LBB1_5
+; GFX942-GISEL-NEXT: .LBB1_6: ; %Flow2
+; GFX942-GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memset.p1.i64(ptr addrspace(1) noundef nonnull align 4 %dst, i8 %setval, i64 %size, i1 false)
+ ret void
+}
+
+define void @memset_p3_varsize_align_4_varsetval(ptr addrspace(3) align 4 %dst, i8 %setval, i64 %size) {
+; GFX942-SDAG-LABEL: memset_p3_varsize_align_4_varsetval:
+; GFX942-SDAG: ; %bb.0: ; %entry
+; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, v3
+; GFX942-SDAG-NEXT: v_and_b32_e32 v4, -16, v2
+; GFX942-SDAG-NEXT: v_and_b32_e32 v10, 15, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v11, 0
+; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0
+; GFX942-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX942-SDAG-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX942-SDAG-NEXT: s_cbranch_execz .LBB2_3
+; GFX942-SDAG-NEXT: ; %bb.1: ; %dynamic-memset-expansion-main-body.preheader
+; GFX942-SDAG-NEXT: s_mov_b32 s4, 0x4040404
+; GFX942-SDAG-NEXT: v_perm_b32 v6, v1, v1, s4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, v6
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v8, v6
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, v6
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v9, v0
+; GFX942-SDAG-NEXT: s_mov_b64 s[4:5], 0
+; GFX942-SDAG-NEXT: .LBB2_2: ; %dynamic-memset-expansion-main-body
+; GFX942-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-SDAG-NEXT: s_add_u32 s4, s4, 16
+; GFX942-SDAG-NEXT: s_addc_u32 s5, s5, 0
+; GFX942-SDAG-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[4:5]
+; GFX942-SDAG-NEXT: ds_write2_b32 v9, v8, v7 offset0:2 offset1:3
+; GFX942-SDAG-NEXT: ds_write2_b32 v9, v6, v3 offset1:1
+; GFX942-SDAG-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT: v_add_u32_e32 v9, 16, v9
+; GFX942-SDAG-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT: s_cbranch_execnz .LBB2_2
+; GFX942-SDAG-NEXT: .LBB2_3: ; %Flow7
+; GFX942-SDAG-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0
+; GFX942-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; GFX942-SDAG-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX942-SDAG-NEXT: s_cbranch_execz .LBB2_6
+; GFX942-SDAG-NEXT: ; %bb.4: ; %dynamic-memset-expansion-residual-body.preheader
+; GFX942-SDAG-NEXT: v_and_b32_e32 v2, -16, v2
+; GFX942-SDAG-NEXT: v_add_u32_e32 v0, v0, v2
+; GFX942-SDAG-NEXT: s_mov_b64 s[4:5], 0
+; GFX942-SDAG-NEXT: .LBB2_5: ; %dynamic-memset-expansion-residual-body
+; GFX942-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-SDAG-NEXT: s_add_u32 s4, s4, 1
+; GFX942-SDAG-NEXT: s_addc_u32 s5, s5, 0
+; GFX942-SDAG-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[10:11]
+; GFX942-SDAG-NEXT: ds_write_b8 v0, v1
+; GFX942-SDAG-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT: v_add_u32_e32 v0, 1, v0
+; GFX942-SDAG-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT: s_cbranch_execnz .LBB2_5
+; GFX942-SDAG-NEXT: .LBB2_6: ; %Flow5
+; GFX942-SDAG-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: memset_p3_varsize_align_4_varsetval:
+; GFX942-GISEL: ; %bb.0: ; %entry
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_and_b32_e32 v4, 15, v2
+; GFX942-GISEL-NEXT: v_sub_co_u32_e32 v6, vcc, v2, v4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: v_subbrev_co_u32_e32 v7, vcc, 0, v3, vcc
+; GFX942-GISEL-NEXT: s_mov_b64 s[0:1], 0
+; GFX942-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; GFX942-GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX942-GISEL-NEXT: s_cbranch_execz .LBB2_3
+; GFX942-GISEL-NEXT: ; %bb.1: ; %dynamic-memset-expansion-main-body.preheader
+; GFX942-GISEL-NEXT: v_and_b32_e32 v8, 0xff, v1
+; GFX942-GISEL-NEXT: v_lshl_or_b32 v9, v8, 8, v8
+; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v10, 16, v8
+; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v8, 24, v8
+; GFX942-GISEL-NEXT: v_or3_b32 v8, v9, v10, v8
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v9, v8
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v10, v8
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v11, v8
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v14, v0
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
+; GFX942-GISEL-NEXT: .LBB2_2: ; %dynamic-memset-expansion-main-body
+; GFX942-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 16, v12
+; GFX942-GISEL-NEXT: ds_write2_b64 v14, v[8:9], v[10:11] offset1:1
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc
+; GFX942-GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[12:13], v[6:7]
+; GFX942-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-GISEL-NEXT: v_add_u32_e32 v14, 16, v14
+; GFX942-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX942-GISEL-NEXT: s_cbranch_execnz .LBB2_2
+; GFX942-GISEL-NEXT: .LBB2_3: ; %Flow7
+; GFX942-GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX942-GISEL-NEXT: s_mov_b64 s[0:1], 0
+; GFX942-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX942-GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX942-GISEL-NEXT: s_cbranch_execz .LBB2_6
+; GFX942-GISEL-NEXT: ; %bb.4: ; %dynamic-memset-expansion-residual-body.preheader
+; GFX942-GISEL-NEXT: v_lshrrev_b64 v[2:3], 4, v[2:3]
+; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v2
+; GFX942-GISEL-NEXT: v_add_u32_e32 v0, v0, v2
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942-GISEL-NEXT: .LBB2_5: ; %dynamic-memset-expansion-residual-body
+; GFX942-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 1, v2
+; GFX942-GISEL-NEXT: ds_write_b8 v0, v1
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX942-GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[4:5]
+; GFX942-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-GISEL-NEXT: v_add_u32_e32 v0, 1, v0
+; GFX942-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX942-GISEL-NEXT: s_cbranch_execnz .LBB2_5
+; GFX942-GISEL-NEXT: .LBB2_6: ; %Flow5
+; GFX942-GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memset.p3.i64(ptr addrspace(3) noundef nonnull align 4 %dst, i8 %setval, i64 %size, i1 false)
+ ret void
+}
+
+define void @memset_p5_varsize_align_4_varsetval(ptr addrspace(5) align 4 %dst, i8 %setval, i64 %size) {
+; GFX942-SDAG-LABEL: memset_p5_varsize_align_4_varsetval:
+; GFX942-SDAG: ; %bb.0: ; %entry
+; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, v3
+; GFX942-SDAG-NEXT: v_and_b32_e32 v4, -16, v2
+; GFX942-SDAG-NEXT: v_and_b32_e32 v10, 15, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v11, 0
+; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0
+; GFX942-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX942-SDAG-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX942-SDAG-NEXT: s_cbranch_execz .LBB3_3
+; GFX942-SDAG-NEXT: ; %bb.1: ; %dynamic-memset-expansion-main-body.preheader
+; GFX942-SDAG-NEXT: s_mov_b32 s4, 0x4040404
+; GFX942-SDAG-NEXT: v_perm_b32 v6, v1, v1, s4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, v6
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v8, v6
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v9, v6
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, v0
+; GFX942-SDAG-NEXT: s_mov_b64 s[4:5], 0
+; GFX942-SDAG-NEXT: .LBB3_2: ; %dynamic-memset-expansion-main-body
+; GFX942-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-SDAG-NEXT: s_add_u32 s4, s4, 16
+; GFX942-SDAG-NEXT: s_addc_u32 s5, s5, 0
+; GFX942-SDAG-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[4:5]
+; GFX942-SDAG-NEXT: scratch_store_dwordx4 v3, v[6:9], off
+; GFX942-SDAG-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT: v_add_u32_e32 v3, 16, v3
+; GFX942-SDAG-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT: s_cbranch_execnz .LBB3_2
+; GFX942-SDAG-NEXT: .LBB3_3: ; %Flow7
+; GFX942-SDAG-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0
+; GFX942-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; GFX942-SDAG-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX942-SDAG-NEXT: s_cbranch_execz .LBB3_6
+; GFX942-SDAG-NEXT: ; %bb.4: ; %dynamic-memset-expansion-residual-body.preheader
+; GFX942-SDAG-NEXT: v_and_b32_e32 v2, -16, v2
+; GFX942-SDAG-NEXT: v_add_u32_e32 v0, v0, v2
+; GFX942-SDAG-NEXT: s_mov_b64 s[4:5], 0
+; GFX942-SDAG-NEXT: .LBB3_5: ; %dynamic-memset-expansion-residual-body
+; GFX942-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-SDAG-NEXT: s_add_u32 s4, s4, 1
+; GFX942-SDAG-NEXT: s_addc_u32 s5, s5, 0
+; GFX942-SDAG-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[10:11]
+; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off
+; GFX942-SDAG-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT: v_add_u32_e32 v0, 1, v0
+; GFX942-SDAG-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT: s_cbranch_execnz .LBB3_5
+; GFX942-SDAG-NEXT: .LBB3_6: ; %Flow5
+; GFX942-SDAG-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: memset_p5_varsize_align_4_varsetval:
+; GFX942-GISEL: ; %bb.0: ; %entry
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_and_b32_e32 v8, 15, v2
+; GFX942-GISEL-NEXT: v_sub_co_u32_e32 v10, vcc, v2, v8
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v9, 0
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: v_subbrev_co_u32_e32 v11, vcc, 0, v3, vcc
+; GFX942-GISEL-NEXT: s_mov_b64 s[0:1], 0
+; GFX942-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; GFX942-GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX942-GISEL-NEXT: s_cbranch_execz .LBB3_3
+; GFX942-GISEL-NEXT: ; %bb.1: ; %dynamic-memset-expansion-main-body.preheader
+; GFX942-GISEL-NEXT: v_and_b32_e32 v4, 0xff, v1
+; GFX942-GISEL-NEXT: v_lshl_or_b32 v5, v4, 8, v4
+; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v6, 16, v4
+; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v4, 24, v4
+; GFX942-GISEL-NEXT: v_or3_b32 v4, v5, v6, v4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v5, v4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v14, v0
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
+; GFX942-GISEL-NEXT: .LBB3_2: ; %dynamic-memset-expansion-main-body
+; GFX942-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 16, v12
+; GFX942-GISEL-NEXT: scratch_store_dwordx4 v14, v[4:7], off
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc
+; GFX942-GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[12:13], v[10:11]
+; GFX942-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-GISEL-NEXT: v_add_u32_e32 v14, 16, v14
+; GFX942-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX942-GISEL-NEXT: s_cbranch_execnz .LBB3_2
+; GFX942-GISEL-NEXT: .LBB3_3: ; %Flow7
+; GFX942-GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX942-GISEL-NEXT: s_mov_b64 s[0:1], 0
+; GFX942-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
+; GFX942-GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX942-GISEL-NEXT: s_cbranch_execz .LBB3_6
+; GFX942-GISEL-NEXT: ; %bb.4: ; %dynamic-memset-expansion-residual-body.preheader
+; GFX942-GISEL-NEXT: v_lshrrev_b64 v[2:3], 4, v[2:3]
+; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v2
+; GFX942-GISEL-NEXT: v_add_u32_e32 v0, v0, v2
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942-GISEL-NEXT: .LBB3_5: ; %dynamic-memset-expansion-residual-body
+; GFX942-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 1, v2
+; GFX942-GISEL-NEXT: scratch_store_byte v0, v1, off
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX942-GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[8:9]
+; GFX942-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-GISEL-NEXT: v_add_u32_e32 v0, 1, v0
+; GFX942-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX942-GISEL-NEXT: s_cbranch_execnz .LBB3_5
+; GFX942-GISEL-NEXT: .LBB3_6: ; %Flow5
+; GFX942-GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memset.p5.i64(ptr addrspace(5) noundef nonnull align 4 %dst, i8 %setval, i64 %size, i1 false)
+ ret void
+}
+
+define void @memset_p0_sz1055_align_4_varsetval(ptr addrspace(0) align 4 %dst, i8 %setval) {
+; GFX942-SDAG-LABEL: memset_p0_sz1055_align_4_varsetval:
+; GFX942-SDAG: ; %bb.0: ; %entry
+; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_mov_b32 s0, 0x4040404
+; GFX942-SDAG-NEXT: v_perm_b32 v4, v2, v2, s0
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, v42 ; Reload Reuse
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, v43 ; Reload Reuse
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a4, v44 ; Reload Reuse
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a5, v45 ; Reload Reuse
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a6, v46 ; Reload Reuse
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a7, v47 ; Reload Reuse
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a8, v56 ; Reload Reuse
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a9, v57 ; Reload Reuse
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a10, v58 ; Reload Reuse
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a11, v59 ; Reload Reuse
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a12, v60 ; Reload Reuse
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a13, v61 ; Reload Reuse
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v8, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v9, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v10, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v11, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v12, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v13, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v14, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v15, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v17, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v20, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v21, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v23, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v24, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v25, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v26, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v27, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v28, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v29, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v30, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v31, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v32, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v33, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v34, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v35, v4
+; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0
+; GFX942-SDAG-NEXT: s_mov_b64 s[2:3], 0x70
+; GFX942-SDAG-NEXT: s_mov_b64 s[4:5], 0x60
+; GFX942-SDAG-NEXT: s_mov_b64 s[6:7], 0x50
+; GFX942-SDAG-NEXT: s_mov_b64 s[8:9], 0xf0
+; GFX942-SDAG-NEXT: s_mov_b64 s[10:11], 0xe0
+; GFX942-SDAG-NEXT: s_mov_b64 s[12:13], 0xd0
+; GFX942-SDAG-NEXT: s_mov_b64 s[14:15], 0xc0
+; GFX942-SDAG-NEXT: s_mov_b64 s[16:17], 0xb0
+; GFX942-SDAG-NEXT: s_mov_b64 s[18:19], 0xa0
+; GFX942-SDAG-NEXT: s_mov_b64 s[20:21], 0x90
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[36:37], 0x400
+; GFX942-SDAG-NEXT: .LBB4_1: ; %static-memset-expansion-main-body
+; GFX942-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-SDAG-NEXT: v_lshl_add_u64 v[38:39], v[0:1], 0, s[0:1]
+; GFX942-SDAG-NEXT: s_add_u32 s0, s0, 0x100
+; GFX942-SDAG-NEXT: s_addc_u32 s1, s1, 0
+; GFX942-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[36:37]
+; GFX942-SDAG-NEXT: v_lshl_add_u64 v[48:49], v[38:39], 0, s[2:3]
+; GFX942-SDAG-NEXT: s_and_b64 vcc, exec, vcc
+; GFX942-SDAG-NEXT: v_lshl_add_u64 v[50:51], v[38:39], 0, s[4:5]
+; GFX942-SDAG-NEXT: v_lshl_add_u64 v[52:53], v[38:39], 0, s[6:7]
+; GFX942-SDAG-NEXT: flat_store_dwordx4 v[38:39], v[20:23] offset:64
+; GFX942-SDAG-NEXT: v_lshl_add_u64 v[54:55], v[38:39], 0, 48
+; GFX942-SDAG-NEXT: flat_store_dwordx4 v[38:39], v[12:15] offset:32
+; GFX942-SDAG-NEXT: flat_store_dwordx4 v[38:39], v[8:11] offset:16
+; GFX942-SDAG-NEXT: flat_store_dwordx4 v[38:39], v[4:7]
+; GFX942-SDAG-NEXT: v_lshl_add_u64 v[40:41], v[38:39], 0, s[8:9]
+; GFX942-SDAG-NEXT: v_lshl_add_u64 v[42:43], v[38:39], 0, s[10:11]
+; GFX942-SDAG-NEXT: v_lshl_add_u64 v[44:45], v[38:39], 0, s[12:13]
+; GFX942-SDAG-NEXT: v_lshl_add_u64 v[46:47], v[38:39], 0, s[14:15]
+; GFX942-SDAG-NEXT: v_lshl_add_u64 v[56:57], v[38:39], 0, s[16:17]
+; GFX942-SDAG-NEXT: v_lshl_add_u64 v[58:59], v[38:39], 0, s[18:19]
+; GFX942-SDAG-NEXT: v_lshl_add_u64 v[60:61], v[38:39], 0, s[20:21]
+; GFX942-SDAG-NEXT: flat_store_dwordx4 v[38:39], v[4:7] offset:128
+; GFX942-SDAG-NEXT: flat_store_dwordx4 v[48:49], v[32:35]
+; GFX942-SDAG-NEXT: flat_store_dwordx4 v[50:51], v[28:31]
+; GFX942-SDAG-NEXT: flat_store_dwordx4 v[52:53], v[24:27]
+; GFX942-SDAG-NEXT: flat_store_dwordx4 v[54:55], v[16:19]
+; GFX942-SDAG-NEXT: flat_store_dwordx4 v[40:41], v[32:35]
+; GFX942-SDAG-NEXT: flat_store_dwordx4 v[42:43], v[28:31]
+; GFX942-SDAG-NEXT: flat_store_dwordx4 v[44:45], v[24:27]
+; GFX942-SDAG-NEXT: flat_store_dwordx4 v[46:47], v[20:23]
+; GFX942-SDAG-NEXT: flat_store_dwordx4 v[56:57], v[16:19]
+; GFX942-SDAG-NEXT: flat_store_dwordx4 v[58:59], v[12:15]
+; GFX942-SDAG-NEXT: flat_store_dwordx4 v[60:61], v[8:11]
+; GFX942-SDAG-NEXT: s_cbranch_vccnz .LBB4_1
+; GFX942-SDAG-NEXT: ; %bb.2: ; %static-memset-post-expansion
+; GFX942-SDAG-NEXT: s_mov_b32 s0, 0x4040404
+; GFX942-SDAG-NEXT: v_perm_b32 v4, v2, v2, s0
+; GFX942-SDAG-NEXT: v_lshlrev_b16_e32 v3, 8, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-SDAG-NEXT: v_or_b32_sdwa v3, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX942-SDAG-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:1024
+; GFX942-SDAG-NEXT: flat_store_dwordx3 v[0:1], v[4:6] offset:1040
+; GFX942-SDAG-NEXT: flat_store_short v[0:1], v3 offset:1052
+; GFX942-SDAG-NEXT: flat_store_byte v[0:1], v2 offset:1054
+; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v61, a13 ; Reload Reuse
+; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v60, a12 ; Reload Reuse
+; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v59, a11 ; Reload Reuse
+; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v58, a10 ; Reload Reuse
+; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v57, a9 ; Reload Reuse
+; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v56, a8 ; Reload Reuse
+; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v47, a7 ; Reload Reuse
+; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v46, a6 ; Reload Reuse
+; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v45, a5 ; Reload Reuse
+; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v44, a4 ; Reload Reuse
+; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v43, a3 ; Reload Reuse
+; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v42, a2 ; Reload Reuse
+; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v41, a1 ; Reload Reuse
+; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse
+; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: memset_p0_sz1055_align_4_varsetval:
+; GFX942-GISEL: ; %bb.0: ; %entry
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_and_b32_e32 v3, 0xff, v2
+; GFX942-GISEL-NEXT: v_lshl_or_b32 v4, v3, 8, v3
+; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GFX942-GISEL-NEXT: v_or3_b32 v4, v4, v5, v3
+; GFX942-GISEL-NEXT: s_mov_b64 s[0:1], 0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v5, v4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], 0x400
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[0:1]
+; GFX942-GISEL-NEXT: .LBB4_1: ; %static-memset-expansion-main-body
+; GFX942-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, v0, v10
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, v1, v11, vcc
+; GFX942-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, 0x100, v10
+; GFX942-GISEL-NEXT: flat_store_dwordx4 v[12:13], v[4:7]
+; GFX942-GISEL-NEXT: flat_store_dwordx4 v[12:13], v[4:7] offset:16
+; GFX942-GISEL-NEXT: flat_store_dwordx4 v[12:13], v[4:7] offset:32
+; GFX942-GISEL-NEXT: flat_store_dwordx4 v[12:13], v[4:7] offset:48
+; GFX942-GISEL-NEXT: flat_store_dwordx4 v[12:13], v[4:7] offset:64
+; GFX942-GISEL-NEXT: flat_store_dwordx4 v[12:13], v[4:7] offset:80
+; GFX942-GISEL-NEXT: flat_store_dwordx4 v[12:13], v[4:7] offset:96
+; GFX942-GISEL-NEXT: flat_store_dwordx4 v[12:13], v[4:7] offset:112
+; GFX942-GISEL-NEXT: flat_store_dwordx4 v[12:13], v[4:7] offset:128
+; GFX942-GISEL-NEXT: flat_store_dwordx4 v[12:13], v[4:7] offset:144
+; GFX942-GISEL-NEXT: flat_store_dwordx4 v[12:13], v[4:7] offset:160
+; GFX942-GISEL-NEXT: flat_store_dwordx4 v[12:13], v[4:7] offset:176
+; GFX942-GISEL-NEXT: flat_store_dwordx4 v[12:13], v[4:7] offset:192
+; GFX942-GISEL-NEXT: flat_store_dwordx4 v[12:13], v[4:7] offset:208
+; GFX942-GISEL-NEXT: flat_store_dwordx4 v[12:13], v[4:7] offset:224
+; GFX942-GISEL-NEXT: flat_store_dwordx4 v[12:13], v[4:7] offset:240
+; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc
+; GFX942-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[8:9]
+; GFX942-GISEL-NEXT: s_cbranch_vccnz .LBB4_1
+; GFX942-GISEL-NEXT: ; %bb.2: ; %static-memset-post-expansion
+; GFX942-GISEL-NEXT: v_and_b32_e32 v3, 0xff, v2
+; GFX942-GISEL-NEXT: v_lshl_or_b32 v4, v3, 8, v3
+; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GFX942-GISEL-NEXT: v_or3_b32 v4, v4, v5, v3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 8
+; GFX942-GISEL-NEXT: v_lshlrev_b16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v5, v4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-GISEL-NEXT: v_or_b32_sdwa v3, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX942-GISEL-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:1024
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v8, v4
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v3
+; GFX942-GISEL-NEXT: v_lshl_or_b32 v6, v5, 16, v5
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v7, v6
+; GFX942-GISEL-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:1040
+; GFX942-GISEL-NEXT: flat_store_short v[0:1], v3 offset:1052
+; GFX942-GISEL-NEXT: flat_store_byte v[0:1], v2 offset:1054
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memset.p0.i64(ptr addrspace(0) noundef nonnull align 4 %dst, i8 %setval, i64 1055, i1 false)
+ ret void
+}
+
+define void @memset_p0_sz2048_align_4_varsetval(ptr addrspace(0) align 4 %dst, i8 %setval) {
+; GFX942-SDAG-LABEL: memset_p0_sz2048_align_4_varsetval:
+; GFX942-SDAG: ; %bb.0: ; %entry
+; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_mov_b32 s0, 0x4040404
+; GFX942-SDAG-NEXT: v_perm_b32 v2, v2, v2, s0
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a2, v42 ; Reload Reuse
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a3, v43 ; Reload Reuse
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a4, v44 ; Reload Reuse
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a5, v45 ; Reload Reuse
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a6, v46 ; Reload Reuse
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a7, v47 ; Reload Reuse
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a8, v56 ; Reload Reuse
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a9, v57 ; Reload Reuse
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a10, v58 ; Reload Reuse
+; GFX942-SDAG-NEXT: v_accvgpr_write_b32 a11, v59 ; Reload Reuse
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v8, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v9, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v10, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v11, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v12, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v13, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v14, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v15, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v17, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v20, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v21, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v23, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v24, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v25, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v26, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v27, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v28, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v29, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v30, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v31, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v32, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v33, v2
+; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0
+; GFX942-SDAG-NEXT: s_mov_b64 s[2:3], 0x70
+; GFX942-SDAG-NEXT: s_mov_b64 s[4:5], 0x60
+; GFX942-SDAG-NEXT: s_mov_b64 s[6:7], 0x50
+; GFX942-SDAG-NEXT: s_mov_b64 s[8:9], 0xf0
+; GFX942-SDAG-NEXT: s_mov_b64 s[10:11], 0xe0
+; GFX942-SDAG-NEXT: s_mov_b64 s[12:13], 0xd0
+; GFX942-SDAG-NEXT: s_mov_b64 s[14:15], 0xc0
+; GFX942-SDAG-NEXT: s_mov_b64 s[16:17], 0xb0
+; GFX942-SDAG-NEXT: s_mov_b64 s[18:19], 0xa0
+; GFX942-SDAG-NEXT: s_mov_b64 s[20:21], 0x90
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[34:35], 0x800
+; GFX942-SDAG-NEXT: .LBB5_1: ; %static-memset-expansion-main-body
+; GFX942-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-SDAG-NEXT: v_lshl_add_u64 v[36:37], v[0:1], 0, s[0:1]
+; GFX942-SDAG-NEXT: s_add_u32 s0, s0, 0x100
+; GFX942-SDAG-NEXT: s_addc_u32 s1, s1, 0
+; GFX942-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[34:35]
+; GFX942-SDAG-NEXT: v_lshl_add_u64 v[38:39], v[36:37], 0, s[2:3]
+; GFX942-SDAG-NEXT: s_and_b64 vcc, exec, vcc
+; GFX942-SDAG-NEXT: v_lshl_add_u64 v[48:49], v[36:37], 0, s[4:5]
+; GFX942-SDAG-NEXT: v_lshl_add_u64 v[50:51], v[36:37], 0, s[6:7]
+; GFX942-SDAG-NEXT: flat_store_dwordx4 v[36:37], v[18:21] offset:64
+; GFX942-SDAG-NEXT: v_lshl_add_u64 v[52:53], v[36:37], 0, 48
+; GFX942-SDAG-NEXT: flat_store_dwordx4 v[36:37], v[10:13] offset:32
+; GFX942-SDAG-NEXT: flat_store_dwordx4 v[36:37], v[6:9] offset:16
+; GFX942-SDAG-NEXT: flat_store_dwordx4 v[36:37], v[2:5]
+; GFX942-SDAG-NEXT: v_lshl_add_u64 v[54:55], v[36:37], 0, s[8:9]
+; GFX942-SDAG-NEXT: v_lshl_add_u64 v[40:41], v[36:37], 0, s[10:11]
+; GFX942-SDAG-NEXT: v_lshl_add_u64 v[42:43], v[36:37], 0, s[12:13]
+; GFX942-SDAG-NEXT: v_lshl_add_u64 v[44:45], v[36:37], 0, s[14:15]
+; GFX942-SDAG-NEXT: v_lshl_add_u64 v[46:47], v[36:37], 0, s[16:17]
+; GFX942-SDAG-NEXT: v_lshl_add_u64 v[56:57], v[36:37], 0, s[18:19]
+; GFX942-SDAG-NEXT: v_lshl_add_u64 v[58:59], v[36:37], 0, s[20:21]
+; GFX942-SDAG-NEXT: flat_store_dwordx4 v[36:37], v[2:5] offset:128
+; GFX942-SDAG-NEXT: flat_store_dwordx4 v[38:39], v[30:33]
+; GFX942-SDAG-NEXT: flat_store_dwordx4 v[48:49], v[26:29]
+; GFX942-SDAG-NEXT: flat_store_dwordx4 v[50:51], v[22:25]
+; GFX942-SDAG-NEXT: flat_store_dwordx4 v[52:53], v[14:17]
+; GFX942-SDAG-NEXT: flat_store_dwordx4 v[54:55], v[30:33]
+; GFX942-SDAG-NEXT: flat_store_dwordx4 v[40:41], v[26:29]
+; GFX942-SDAG-NEXT: flat_store_dwordx4 v[42:43], v[22:25]
+; GFX942-SDAG-NEXT: flat_store_dwordx4 v[44:45], v[18:21]
+; GFX942-SDAG-NEXT: flat_store_dwordx4 v[46:47], v[14:17]
+; GFX942-SDAG-NEXT: flat_store_dwordx4 v[56:57], v[10:13]
+; GFX942-SDAG-NEXT: flat_store_dwordx4 v[58:59], v[6:9]
+; GFX942-SDAG-NEXT: s_cbranch_vccnz .LBB5_1
+; GFX942-SDAG-NEXT: ; %bb.2: ; %static-memset-post-expansion
+; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v59, a11 ; Reload Reuse
+; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v58, a10 ; Reload Reuse
+; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v57, a9 ; Reload Reuse
+; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v56, a8 ; Reload Reuse
+; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v47, a7 ; Reload Reuse
+; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v46, a6 ; Reload Reuse
+; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v45, a5 ; Reload Reuse
+; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v44, a4 ; Reload Reuse
+; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v43, a3 ; Reload Reuse
+; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v42, a2 ; Reload Reuse
+; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v41, a1 ; Reload Reuse
+; GFX942-SDAG-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse
+; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: memset_p0_sz2048_align_4_varsetval:
+; GFX942-GISEL: ; %bb.0: ; %entry
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX942-GISEL-NEXT: v_lshl_or_b32 v3, v2, 8, v2
+; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; GFX942-GISEL-NEXT: v_or3_b32 v2, v3, v4, v2
+; GFX942-GISEL-NEXT: s_mov_b64 s[0:1], 0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, v2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], 0x800
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-GISEL-NEXT: .LBB5_1: ; %static-memset-expansion-main-body
+; GFX942-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, v0, v8
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, v1, v9, vcc
+; GFX942-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 0x100, v8
+; GFX942-GISEL-NEXT: flat_store_dwordx4 v[10:11], v[2:5]
+; GFX942-GISEL-NEXT: flat_store_dwordx4 v[10:11], v[2:5] offset:16
+; GFX942-GISEL-NEXT: flat_store_dwordx4 v[10:11], v[2:5] offset:32
+; GFX942-GISEL-NEXT: flat_store_dwordx4 v[10:11], v[2:5] offset:48
+; GFX942-GISEL-NEXT: flat_store_dwordx4 v[10:11], v[2:5] offset:64
+; GFX942-GISEL-NEXT: flat_store_dwordx4 v[10:11], v[2:5] offset:80
+; GFX942-GISEL-NEXT: flat_store_dwordx4 v[10:11], v[2:5] offset:96
+; GFX942-GISEL-NEXT: flat_store_dwordx4 v[10:11], v[2:5] offset:112
+; GFX942-GISEL-NEXT: flat_store_dwordx4 v[10:11], v[2:5] offset:128
+; GFX942-GISEL-NEXT: flat_store_dwordx4 v[10:11], v[2:5] offset:144
+; GFX942-GISEL-NEXT: flat_store_dwordx4 v[10:11], v[2:5] offset:160
+; GFX942-GISEL-NEXT: flat_store_dwordx4 v[10:11], v[2:5] offset:176
+; GFX942-GISEL-NEXT: flat_store_dwordx4 v[10:11], v[2:5] offset:192
+; GFX942-GISEL-NEXT: flat_store_dwordx4 v[10:11], v[2:5] offset:208
+; GFX942-GISEL-NEXT: flat_store_dwordx4 v[10:11], v[2:5] offset:224
+; GFX942-GISEL-NEXT: flat_store_dwordx4 v[10:11], v[2:5] offset:240
+; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
+; GFX942-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[6:7]
+; GFX942-GISEL-NEXT: s_cbranch_vccnz .LBB5_1
+; GFX942-GISEL-NEXT: ; %bb.2: ; %static-memset-post-expansion
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memset.p0.i64(ptr addrspace(0) noundef nonnull align 4 %dst, i8 %setval, i64 2048, i1 false)
+ ret void
+}
+
+define void @memset_p1_sz1055_align_4_varsetval(ptr addrspace(1) align 4 %dst, i8 %setval) {
+; GFX942-SDAG-LABEL: memset_p1_sz1055_align_4_varsetval:
+; GFX942-SDAG: ; %bb.0: ; %entry
+; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_mov_b32 s0, 0x4040404
+; GFX942-SDAG-NEXT: v_perm_b32 v4, v2, v2, s0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v8, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v9, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v10, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v11, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v12, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v13, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v14, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v15, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v17, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v20, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v21, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v23, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v24, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v25, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v26, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v27, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v28, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v29, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v30, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v31, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v32, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v33, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v34, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v35, v4
+; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[36:37], 0x400
+; GFX942-SDAG-NEXT: .LBB6_1: ; %static-memset-expansion-main-body
+; GFX942-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-SDAG-NEXT: v_lshl_add_u64 v[38:39], v[0:1], 0, s[0:1]
+; GFX942-SDAG-NEXT: s_add_u32 s0, s0, 0x100
+; GFX942-SDAG-NEXT: s_addc_u32 s1, s1, 0
+; GFX942-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[36:37]
+; GFX942-SDAG-NEXT: s_and_b64 vcc, exec, vcc
+; GFX942-SDAG-NEXT: global_store_dwordx4 v[38:39], v[32:35], off offset:112
+; GFX942-SDAG-NEXT: global_store_dwordx4 v[38:39], v[28:31], off offset:96
+; GFX942-SDAG-NEXT: global_store_dwordx4 v[38:39], v[24:27], off offset:80
+; GFX942-SDAG-NEXT: global_store_dwordx4 v[38:39], v[20:23], off offset:64
+; GFX942-SDAG-NEXT: global_store_dwordx4 v[38:39], v[16:19], off offset:48
+; GFX942-SDAG-NEXT: global_store_dwordx4 v[38:39], v[12:15], off offset:32
+; GFX942-SDAG-NEXT: global_store_dwordx4 v[38:39], v[8:11], off offset:16
+; GFX942-SDAG-NEXT: global_store_dwordx4 v[38:39], v[4:7], off
+; GFX942-SDAG-NEXT: global_store_dwordx4 v[38:39], v[32:35], off offset:240
+; GFX942-SDAG-NEXT: global_store_dwordx4 v[38:39], v[28:31], off offset:224
+; GFX942-SDAG-NEXT: global_store_dwordx4 v[38:39], v[24:27], off offset:208
+; GFX942-SDAG-NEXT: global_store_dwordx4 v[38:39], v[20:23], off offset:192
+; GFX942-SDAG-NEXT: global_store_dwordx4 v[38:39], v[16:19], off offset:176
+; GFX942-SDAG-NEXT: global_store_dwordx4 v[38:39], v[12:15], off offset:160
+; GFX942-SDAG-NEXT: global_store_dwordx4 v[38:39], v[8:11], off offset:144
+; GFX942-SDAG-NEXT: global_store_dwordx4 v[38:39], v[4:7], off offset:128
+; GFX942-SDAG-NEXT: s_cbranch_vccnz .LBB6_1
+; GFX942-SDAG-NEXT: ; %bb.2: ; %static-memset-post-expansion
+; GFX942-SDAG-NEXT: s_mov_b32 s0, 0x4040404
+; GFX942-SDAG-NEXT: v_perm_b32 v4, v2, v2, s0
+; GFX942-SDAG-NEXT: v_lshlrev_b16_e32 v3, 8, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-SDAG-NEXT: v_or_b32_sdwa v3, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX942-SDAG-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:1024
+; GFX942-SDAG-NEXT: global_store_dwordx3 v[0:1], v[4:6], off offset:1040
+; GFX942-SDAG-NEXT: global_store_short v[0:1], v3, off offset:1052
+; GFX942-SDAG-NEXT: global_store_byte v[0:1], v2, off offset:1054
+; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: memset_p1_sz1055_align_4_varsetval:
+; GFX942-GISEL: ; %bb.0: ; %entry
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_and_b32_e32 v3, 0xff, v2
+; GFX942-GISEL-NEXT: v_lshl_or_b32 v4, v3, 8, v3
+; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GFX942-GISEL-NEXT: v_or3_b32 v4, v4, v5, v3
+; GFX942-GISEL-NEXT: s_mov_b64 s[0:1], 0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v5, v4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], 0x400
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[0:1]
+; GFX942-GISEL-NEXT: .LBB6_1: ; %static-memset-expansion-main-body
+; GFX942-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, v0, v10
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, v1, v11, vcc
+; GFX942-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, 0x100, v10
+; GFX942-GISEL-NEXT: global_store_dwordx4 v[12:13], v[4:7], off
+; GFX942-GISEL-NEXT: global_store_dwordx4 v[12:13], v[4:7], off offset:16
+; GFX942-GISEL-NEXT: global_store_dwordx4 v[12:13], v[4:7], off offset:32
+; GFX942-GISEL-NEXT: global_store_dwordx4 v[12:13], v[4:7], off offset:48
+; GFX942-GISEL-NEXT: global_store_dwordx4 v[12:13], v[4:7], off offset:64
+; GFX942-GISEL-NEXT: global_store_dwordx4 v[12:13], v[4:7], off offset:80
+; GFX942-GISEL-NEXT: global_store_dwordx4 v[12:13], v[4:7], off offset:96
+; GFX942-GISEL-NEXT: global_store_dwordx4 v[12:13], v[4:7], off offset:112
+; GFX942-GISEL-NEXT: global_store_dwordx4 v[12:13], v[4:7], off offset:128
+; GFX942-GISEL-NEXT: global_store_dwordx4 v[12:13], v[4:7], off offset:144
+; GFX942-GISEL-NEXT: global_store_dwordx4 v[12:13], v[4:7], off offset:160
+; GFX942-GISEL-NEXT: global_store_dwordx4 v[12:13], v[4:7], off offset:176
+; GFX942-GISEL-NEXT: global_store_dwordx4 v[12:13], v[4:7], off offset:192
+; GFX942-GISEL-NEXT: global_store_dwordx4 v[12:13], v[4:7], off offset:208
+; GFX942-GISEL-NEXT: global_store_dwordx4 v[12:13], v[4:7], off offset:224
+; GFX942-GISEL-NEXT: global_store_dwordx4 v[12:13], v[4:7], off offset:240
+; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc
+; GFX942-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[8:9]
+; GFX942-GISEL-NEXT: s_cbranch_vccnz .LBB6_1
+; GFX942-GISEL-NEXT: ; %bb.2: ; %static-memset-post-expansion
+; GFX942-GISEL-NEXT: v_and_b32_e32 v3, 0xff, v2
+; GFX942-GISEL-NEXT: v_lshl_or_b32 v4, v3, 8, v3
+; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; GFX942-GISEL-NEXT: v_or3_b32 v4, v4, v5, v3
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 8
+; GFX942-GISEL-NEXT: v_lshlrev_b16_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v5, v4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-GISEL-NEXT: v_or_b32_sdwa v3, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX942-GISEL-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:1024
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v8, v4
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: v_and_b32_e32 v5, 0xffff, v3
+; GFX942-GISEL-NEXT: v_lshl_or_b32 v6, v5, 16, v5
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v7, v6
+; GFX942-GISEL-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:1040
+; GFX942-GISEL-NEXT: global_store_short v[0:1], v3, off offset:1052
+; GFX942-GISEL-NEXT: global_store_byte v[0:1], v2, off offset:1054
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memset.p1.i64(ptr addrspace(1) noundef nonnull align 4 %dst, i8 %setval, i64 1055, i1 false)
+ ret void
+}
+
+define void @memset_p1_sz2048_align_4_varsetval(ptr addrspace(1) align 4 %dst, i8 %setval) {
+; GFX942-SDAG-LABEL: memset_p1_sz2048_align_4_varsetval:
+; GFX942-SDAG: ; %bb.0: ; %entry
+; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_mov_b32 s0, 0x4040404
+; GFX942-SDAG-NEXT: v_perm_b32 v2, v2, v2, s0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v8, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v9, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v10, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v11, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v12, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v13, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v14, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v15, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v17, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v20, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v21, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v23, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v24, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v25, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v26, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v27, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v28, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v29, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v30, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v31, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v32, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v33, v2
+; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[34:35], 0x800
+; GFX942-SDAG-NEXT: .LBB7_1: ; %static-memset-expansion-main-body
+; GFX942-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-SDAG-NEXT: v_lshl_add_u64 v[36:37], v[0:1], 0, s[0:1]
+; GFX942-SDAG-NEXT: s_add_u32 s0, s0, 0x100
+; GFX942-SDAG-NEXT: s_addc_u32 s1, s1, 0
+; GFX942-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[34:35]
+; GFX942-SDAG-NEXT: s_and_b64 vcc, exec, vcc
+; GFX942-SDAG-NEXT: global_store_dwordx4 v[36:37], v[30:33], off offset:112
+; GFX942-SDAG-NEXT: global_store_dwordx4 v[36:37], v[26:29], off offset:96
+; GFX942-SDAG-NEXT: global_store_dwordx4 v[36:37], v[22:25], off offset:80
+; GFX942-SDAG-NEXT: global_store_dwordx4 v[36:37], v[18:21], off offset:64
+; GFX942-SDAG-NEXT: global_store_dwordx4 v[36:37], v[14:17], off offset:48
+; GFX942-SDAG-NEXT: global_store_dwordx4 v[36:37], v[10:13], off offset:32
+; GFX942-SDAG-NEXT: global_store_dwordx4 v[36:37], v[6:9], off offset:16
+; GFX942-SDAG-NEXT: global_store_dwordx4 v[36:37], v[2:5], off
+; GFX942-SDAG-NEXT: global_store_dwordx4 v[36:37], v[30:33], off offset:240
+; GFX942-SDAG-NEXT: global_store_dwordx4 v[36:37], v[26:29], off offset:224
+; GFX942-SDAG-NEXT: global_store_dwordx4 v[36:37], v[22:25], off offset:208
+; GFX942-SDAG-NEXT: global_store_dwordx4 v[36:37], v[18:21], off offset:192
+; GFX942-SDAG-NEXT: global_store_dwordx4 v[36:37], v[14:17], off offset:176
+; GFX942-SDAG-NEXT: global_store_dwordx4 v[36:37], v[10:13], off offset:160
+; GFX942-SDAG-NEXT: global_store_dwordx4 v[36:37], v[6:9], off offset:144
+; GFX942-SDAG-NEXT: global_store_dwordx4 v[36:37], v[2:5], off offset:128
+; GFX942-SDAG-NEXT: s_cbranch_vccnz .LBB7_1
+; GFX942-SDAG-NEXT: ; %bb.2: ; %static-memset-post-expansion
+; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: memset_p1_sz2048_align_4_varsetval:
+; GFX942-GISEL: ; %bb.0: ; %entry
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX942-GISEL-NEXT: v_lshl_or_b32 v3, v2, 8, v2
+; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; GFX942-GISEL-NEXT: v_or3_b32 v2, v3, v4, v2
+; GFX942-GISEL-NEXT: s_mov_b64 s[0:1], 0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, v2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], 0x800
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-GISEL-NEXT: .LBB7_1: ; %static-memset-expansion-main-body
+; GFX942-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, v0, v8
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, v1, v9, vcc
+; GFX942-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 0x100, v8
+; GFX942-GISEL-NEXT: global_store_dwordx4 v[10:11], v[2:5], off
+; GFX942-GISEL-NEXT: global_store_dwordx4 v[10:11], v[2:5], off offset:16
+; GFX942-GISEL-NEXT: global_store_dwordx4 v[10:11], v[2:5], off offset:32
+; GFX942-GISEL-NEXT: global_store_dwordx4 v[10:11], v[2:5], off offset:48
+; GFX942-GISEL-NEXT: global_store_dwordx4 v[10:11], v[2:5], off offset:64
+; GFX942-GISEL-NEXT: global_store_dwordx4 v[10:11], v[2:5], off offset:80
+; GFX942-GISEL-NEXT: global_store_dwordx4 v[10:11], v[2:5], off offset:96
+; GFX942-GISEL-NEXT: global_store_dwordx4 v[10:11], v[2:5], off offset:112
+; GFX942-GISEL-NEXT: global_store_dwordx4 v[10:11], v[2:5], off offset:128
+; GFX942-GISEL-NEXT: global_store_dwordx4 v[10:11], v[2:5], off offset:144
+; GFX942-GISEL-NEXT: global_store_dwordx4 v[10:11], v[2:5], off offset:160
+; GFX942-GISEL-NEXT: global_store_dwordx4 v[10:11], v[2:5], off offset:176
+; GFX942-GISEL-NEXT: global_store_dwordx4 v[10:11], v[2:5], off offset:192
+; GFX942-GISEL-NEXT: global_store_dwordx4 v[10:11], v[2:5], off offset:208
+; GFX942-GISEL-NEXT: global_store_dwordx4 v[10:11], v[2:5], off offset:224
+; GFX942-GISEL-NEXT: global_store_dwordx4 v[10:11], v[2:5], off offset:240
+; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
+; GFX942-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[6:7]
+; GFX942-GISEL-NEXT: s_cbranch_vccnz .LBB7_1
+; GFX942-GISEL-NEXT: ; %bb.2: ; %static-memset-post-expansion
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memset.p1.i64(ptr addrspace(1) noundef nonnull align 4 %dst, i8 %setval, i64 2048, i1 false)
+ ret void
+}
+
+define void @memset_p3_sz1055_align_4_varsetval(ptr addrspace(3) align 4 %dst, i8 %setval) {
+; GFX942-SDAG-LABEL: memset_p3_sz1055_align_4_varsetval:
+; GFX942-SDAG: ; %bb.0: ; %entry
+; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_mov_b32 s0, 0x4040404
+; GFX942-SDAG-NEXT: v_perm_b32 v2, v1, v1, s0
+; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[34:35], 0x400
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v36, v0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v8, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v9, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v10, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v11, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v12, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v13, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v14, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v15, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v17, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v20, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v21, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v23, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v24, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v25, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v26, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v27, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v28, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v29, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v30, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v31, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v32, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v33, v2
+; GFX942-SDAG-NEXT: .LBB8_1: ; %static-memset-expansion-main-body
+; GFX942-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-SDAG-NEXT: s_add_u32 s0, s0, 0x100
+; GFX942-SDAG-NEXT: s_addc_u32 s1, s1, 0
+; GFX942-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[34:35]
+; GFX942-SDAG-NEXT: ds_write2_b32 v36, v32, v33 offset0:30 offset1:31
+; GFX942-SDAG-NEXT: ds_write2_b32 v36, v30, v31 offset0:28 offset1:29
+; GFX942-SDAG-NEXT: ds_write2_b32 v36, v28, v29 offset0:26 offset1:27
+; GFX942-SDAG-NEXT: ds_write2_b32 v36, v26, v27 offset0:24 offset1:25
+; GFX942-SDAG-NEXT: ds_write2_b32 v36, v24, v25 offset0:22 offset1:23
+; GFX942-SDAG-NEXT: ds_write2_b32 v36, v22, v23 offset0:20 offset1:21
+; GFX942-SDAG-NEXT: ds_write2_b32 v36, v20, v21 offset0:18 offset1:19
+; GFX942-SDAG-NEXT: ds_write2_b32 v36, v18, v19 offset0:16 offset1:17
+; GFX942-SDAG-NEXT: ds_write2_b32 v36, v16, v17 offset0:14 offset1:15
+; GFX942-SDAG-NEXT: ds_write2_b32 v36, v14, v15 offset0:12 offset1:13
+; GFX942-SDAG-NEXT: ds_write2_b32 v36, v12, v13 offset0:10 offset1:11
+; GFX942-SDAG-NEXT: ds_write2_b32 v36, v10, v11 offset0:8 offset1:9
+; GFX942-SDAG-NEXT: ds_write2_b32 v36, v8, v9 offset0:6 offset1:7
+; GFX942-SDAG-NEXT: ds_write2_b32 v36, v6, v7 offset0:4 offset1:5
+; GFX942-SDAG-NEXT: ds_write2_b32 v36, v4, v5 offset0:2 offset1:3
+; GFX942-SDAG-NEXT: ds_write2_b32 v36, v2, v3 offset1:1
+; GFX942-SDAG-NEXT: ds_write2_b32 v36, v32, v33 offset0:62 offset1:63
+; GFX942-SDAG-NEXT: ds_write2_b32 v36, v30, v31 offset0:60 offset1:61
+; GFX942-SDAG-NEXT: ds_write2_b32 v36, v28, v29 offset0:58 offset1:59
+; GFX942-SDAG-NEXT: ds_write2_b32 v36, v26, v27 offset0:56 offset1:57
+; GFX942-SDAG-NEXT: ds_write2_b32 v36, v24, v25 offset0:54 offset1:55
+; GFX942-SDAG-NEXT: ds_write2_b32 v36, v22, v23 offset0:52 offset1:53
+; GFX942-SDAG-NEXT: ds_write2_b32 v36, v20, v21 offset0:50 offset1:51
+; GFX942-SDAG-NEXT: ds_write2_b32 v36, v18, v19 offset0:48 offset1:49
+; GFX942-SDAG-NEXT: ds_write2_b32 v36, v16, v17 offset0:46 offset1:47
+; GFX942-SDAG-NEXT: ds_write2_b32 v36, v14, v15 offset0:44 offset1:45
+; GFX942-SDAG-NEXT: ds_write2_b32 v36, v12, v13 offset0:42 offset1:43
+; GFX942-SDAG-NEXT: ds_write2_b32 v36, v10, v11 offset0:40 offset1:41
+; GFX942-SDAG-NEXT: ds_write2_b32 v36, v8, v9 offset0:38 offset1:39
+; GFX942-SDAG-NEXT: ds_write2_b32 v36, v6, v7 offset0:36 offset1:37
+; GFX942-SDAG-NEXT: ds_write2_b32 v36, v4, v5 offset0:34 offset1:35
+; GFX942-SDAG-NEXT: ds_write2_b32 v36, v2, v3 offset0:32 offset1:33
+; GFX942-SDAG-NEXT: v_add_u32_e32 v36, 0x100, v36
+; GFX942-SDAG-NEXT: s_cbranch_vccnz .LBB8_1
+; GFX942-SDAG-NEXT: ; %bb.2: ; %static-memset-post-expansion
+; GFX942-SDAG-NEXT: s_mov_b32 s0, 0x4040404
+; GFX942-SDAG-NEXT: v_add_u32_e32 v2, 0x400, v0
+; GFX942-SDAG-NEXT: v_add_u32_e32 v3, 0x408, v0
+; GFX942-SDAG-NEXT: v_perm_b32 v4, v1, v1, s0
+; GFX942-SDAG-NEXT: ds_write2_b32 v3, v4, v4 offset1:1
+; GFX942-SDAG-NEXT: ds_write2_b32 v2, v4, v4 offset1:1
+; GFX942-SDAG-NEXT: v_add_u32_e32 v2, 0x410, v0
+; GFX942-SDAG-NEXT: ds_write2_b32 v2, v4, v4 offset1:1
+; GFX942-SDAG-NEXT: ds_write_b32 v0, v4 offset:1048
+; GFX942-SDAG-NEXT: v_lshlrev_b16_e32 v2, 8, v1
+; GFX942-SDAG-NEXT: v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX942-SDAG-NEXT: ds_write_b16 v0, v2 offset:1052
+; GFX942-SDAG-NEXT: ds_write_b8 v0, v1 offset:1054
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: memset_p3_sz1055_align_4_varsetval:
+; GFX942-GISEL: ; %bb.0: ; %entry
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v1
+; GFX942-GISEL-NEXT: v_lshl_or_b32 v3, v2, 8, v2
+; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; GFX942-GISEL-NEXT: v_or3_b32 v2, v3, v4, v2
+; GFX942-GISEL-NEXT: s_mov_b64 s[0:1], 0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, v2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], 0x400
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v10, v0
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-GISEL-NEXT: .LBB8_1: ; %static-memset-expansion-main-body
+; GFX942-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 0x100, v8
+; GFX942-GISEL-NEXT: ds_write2_b64 v10, v[2:3], v[4:5] offset1:1
+; GFX942-GISEL-NEXT: ds_write2_b64 v10, v[2:3], v[4:5] offset0:2 offset1:3
+; GFX942-GISEL-NEXT: ds_write2_b64 v10, v[2:3], v[4:5] offset0:4 offset1:5
+; GFX942-GISEL-NEXT: ds_write2_b64 v10, v[2:3], v[4:5] offset0:6 offset1:7
+; GFX942-GISEL-NEXT: ds_write2_b64 v10, v[2:3], v[4:5] offset0:8 offset1:9
+; GFX942-GISEL-NEXT: ds_write2_b64 v10, v[2:3], v[4:5] offset0:10 offset1:11
+; GFX942-GISEL-NEXT: ds_write2_b64 v10, v[2:3], v[4:5] offset0:12 offset1:13
+; GFX942-GISEL-NEXT: ds_write2_b64 v10, v[2:3], v[4:5] offset0:14 offset1:15
+; GFX942-GISEL-NEXT: ds_write2_b64 v10, v[2:3], v[4:5] offset0:16 offset1:17
+; GFX942-GISEL-NEXT: ds_write2_b64 v10, v[2:3], v[4:5] offset0:18 offset1:19
+; GFX942-GISEL-NEXT: ds_write2_b64 v10, v[2:3], v[4:5] offset0:20 offset1:21
+; GFX942-GISEL-NEXT: ds_write2_b64 v10, v[2:3], v[4:5] offset0:22 offset1:23
+; GFX942-GISEL-NEXT: ds_write2_b64 v10, v[2:3], v[4:5] offset0:24 offset1:25
+; GFX942-GISEL-NEXT: ds_write2_b64 v10, v[2:3], v[4:5] offset0:26 offset1:27
+; GFX942-GISEL-NEXT: ds_write2_b64 v10, v[2:3], v[4:5] offset0:28 offset1:29
+; GFX942-GISEL-NEXT: ds_write2_b64 v10, v[2:3], v[4:5] offset0:30 offset1:31
+; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
+; GFX942-GISEL-NEXT: v_add_u32_e32 v10, 0x100, v10
+; GFX942-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[6:7]
+; GFX942-GISEL-NEXT: s_cbranch_vccnz .LBB8_1
+; GFX942-GISEL-NEXT: ; %bb.2: ; %static-memset-post-expansion
+; GFX942-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v1
+; GFX942-GISEL-NEXT: v_lshl_or_b32 v3, v2, 8, v2
+; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; GFX942-GISEL-NEXT: v_or3_b32 v2, v3, v4, v2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, v2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-GISEL-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:128 offset1:129
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 8
+; GFX942-GISEL-NEXT: v_lshlrev_b16_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX942-GISEL-NEXT: v_or_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX942-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v3
+; GFX942-GISEL-NEXT: v_lshl_or_b32 v4, v4, 16, v4
+; GFX942-GISEL-NEXT: v_add_u32_e32 v5, 0x410, v0
+; GFX942-GISEL-NEXT: ds_write2_b32 v5, v4, v4 offset1:1
+; GFX942-GISEL-NEXT: ds_write_b32 v0, v2 offset:1048
+; GFX942-GISEL-NEXT: ds_write_b16 v0, v3 offset:1052
+; GFX942-GISEL-NEXT: ds_write_b8 v0, v1 offset:1054
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memset.p3.i64(ptr addrspace(3) noundef nonnull align 4 %dst, i8 %setval, i64 1055, i1 false)
+ ret void
+}
+
+define void @memset_p3_sz2048_align_4_varsetval(ptr addrspace(3) align 4 %dst, i8 %setval) {
+; GFX942-SDAG-LABEL: memset_p3_sz2048_align_4_varsetval:
+; GFX942-SDAG: ; %bb.0: ; %entry
+; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_mov_b32 s0, 0x4040404
+; GFX942-SDAG-NEXT: v_perm_b32 v2, v1, v1, s0
+; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[34:35], 0x800
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v1, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v8, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v10, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v9, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v12, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v11, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v14, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v13, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v15, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v17, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v20, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v21, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v24, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v23, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v26, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v25, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v28, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v27, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v30, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v29, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v32, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v31, v2
+; GFX942-SDAG-NEXT: .LBB9_1: ; %static-memset-expansion-main-body
+; GFX942-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-SDAG-NEXT: s_add_u32 s0, s0, 0x100
+; GFX942-SDAG-NEXT: s_addc_u32 s1, s1, 0
+; GFX942-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[34:35]
+; GFX942-SDAG-NEXT: ds_write2_b32 v0, v32, v31 offset0:30 offset1:31
+; GFX942-SDAG-NEXT: ds_write2_b32 v0, v30, v29 offset0:28 offset1:29
+; GFX942-SDAG-NEXT: ds_write2_b32 v0, v28, v27 offset0:26 offset1:27
+; GFX942-SDAG-NEXT: ds_write2_b32 v0, v26, v25 offset0:24 offset1:25
+; GFX942-SDAG-NEXT: ds_write2_b32 v0, v24, v23 offset0:22 offset1:23
+; GFX942-SDAG-NEXT: ds_write2_b32 v0, v22, v21 offset0:20 offset1:21
+; GFX942-SDAG-NEXT: ds_write2_b32 v0, v20, v19 offset0:18 offset1:19
+; GFX942-SDAG-NEXT: ds_write2_b32 v0, v18, v17 offset0:16 offset1:17
+; GFX942-SDAG-NEXT: ds_write2_b32 v0, v16, v15 offset0:14 offset1:15
+; GFX942-SDAG-NEXT: ds_write2_b32 v0, v14, v13 offset0:12 offset1:13
+; GFX942-SDAG-NEXT: ds_write2_b32 v0, v12, v11 offset0:10 offset1:11
+; GFX942-SDAG-NEXT: ds_write2_b32 v0, v10, v9 offset0:8 offset1:9
+; GFX942-SDAG-NEXT: ds_write2_b32 v0, v8, v7 offset0:6 offset1:7
+; GFX942-SDAG-NEXT: ds_write2_b32 v0, v6, v5 offset0:4 offset1:5
+; GFX942-SDAG-NEXT: ds_write2_b32 v0, v4, v3 offset0:2 offset1:3
+; GFX942-SDAG-NEXT: ds_write2_b32 v0, v2, v1 offset1:1
+; GFX942-SDAG-NEXT: ds_write2_b32 v0, v32, v31 offset0:62 offset1:63
+; GFX942-SDAG-NEXT: ds_write2_b32 v0, v30, v29 offset0:60 offset1:61
+; GFX942-SDAG-NEXT: ds_write2_b32 v0, v28, v27 offset0:58 offset1:59
+; GFX942-SDAG-NEXT: ds_write2_b32 v0, v26, v25 offset0:56 offset1:57
+; GFX942-SDAG-NEXT: ds_write2_b32 v0, v24, v23 offset0:54 offset1:55
+; GFX942-SDAG-NEXT: ds_write2_b32 v0, v22, v21 offset0:52 offset1:53
+; GFX942-SDAG-NEXT: ds_write2_b32 v0, v20, v19 offset0:50 offset1:51
+; GFX942-SDAG-NEXT: ds_write2_b32 v0, v18, v17 offset0:48 offset1:49
+; GFX942-SDAG-NEXT: ds_write2_b32 v0, v16, v15 offset0:46 offset1:47
+; GFX942-SDAG-NEXT: ds_write2_b32 v0, v14, v13 offset0:44 offset1:45
+; GFX942-SDAG-NEXT: ds_write2_b32 v0, v12, v11 offset0:42 offset1:43
+; GFX942-SDAG-NEXT: ds_write2_b32 v0, v10, v9 offset0:40 offset1:41
+; GFX942-SDAG-NEXT: ds_write2_b32 v0, v8, v7 offset0:38 offset1:39
+; GFX942-SDAG-NEXT: ds_write2_b32 v0, v6, v5 offset0:36 offset1:37
+; GFX942-SDAG-NEXT: ds_write2_b32 v0, v4, v3 offset0:34 offset1:35
+; GFX942-SDAG-NEXT: ds_write2_b32 v0, v2, v1 offset0:32 offset1:33
+; GFX942-SDAG-NEXT: v_add_u32_e32 v0, 0x100, v0
+; GFX942-SDAG-NEXT: s_cbranch_vccnz .LBB9_1
+; GFX942-SDAG-NEXT: ; %bb.2: ; %static-memset-post-expansion
+; GFX942-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: memset_p3_sz2048_align_4_varsetval:
+; GFX942-GISEL: ; %bb.0: ; %entry
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX942-GISEL-NEXT: v_lshl_or_b32 v2, v1, 8, v1
+; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v1, 24, v1
+; GFX942-GISEL-NEXT: v_or3_b32 v2, v2, v3, v1
+; GFX942-GISEL-NEXT: s_mov_b64 s[0:1], 0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, v2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], 0x800
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-GISEL-NEXT: .LBB9_1: ; %static-memset-expansion-main-body
+; GFX942-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 0x100, v8
+; GFX942-GISEL-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
+; GFX942-GISEL-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:2 offset1:3
+; GFX942-GISEL-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:4 offset1:5
+; GFX942-GISEL-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:6 offset1:7
+; GFX942-GISEL-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:8 offset1:9
+; GFX942-GISEL-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:10 offset1:11
+; GFX942-GISEL-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:12 offset1:13
+; GFX942-GISEL-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:14 offset1:15
+; GFX942-GISEL-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:16 offset1:17
+; GFX942-GISEL-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:18 offset1:19
+; GFX942-GISEL-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:20 offset1:21
+; GFX942-GISEL-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:22 offset1:23
+; GFX942-GISEL-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:24 offset1:25
+; GFX942-GISEL-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:26 offset1:27
+; GFX942-GISEL-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:28 offset1:29
+; GFX942-GISEL-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset0:30 offset1:31
+; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
+; GFX942-GISEL-NEXT: v_add_u32_e32 v0, 0x100, v0
+; GFX942-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[6:7]
+; GFX942-GISEL-NEXT: s_cbranch_vccnz .LBB9_1
+; GFX942-GISEL-NEXT: ; %bb.2: ; %static-memset-post-expansion
+; GFX942-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memset.p3.i64(ptr addrspace(3) noundef nonnull align 4 %dst, i8 %setval, i64 2048, i1 false)
+ ret void
+}
+
+define void @memset_p5_sz1055_align_4_varsetval(ptr addrspace(5) align 4 %dst, i8 %setval) {
+; GFX942-SDAG-LABEL: memset_p5_sz1055_align_4_varsetval:
+; GFX942-SDAG: ; %bb.0: ; %entry
+; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_mov_b32 s0, 0x4040404
+; GFX942-SDAG-NEXT: v_perm_b32 v2, v1, v1, s0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v8, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v9, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v10, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v11, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v12, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v13, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v14, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v15, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v17, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v20, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v21, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v23, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v24, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v25, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v26, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v27, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v28, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v29, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v30, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v31, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v32, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v33, v2
+; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[34:35], 0x400
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v36, v0
+; GFX942-SDAG-NEXT: .LBB10_1: ; %static-memset-expansion-main-body
+; GFX942-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-SDAG-NEXT: s_add_u32 s0, s0, 0x100
+; GFX942-SDAG-NEXT: s_addc_u32 s1, s1, 0
+; GFX942-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[34:35]
+; GFX942-SDAG-NEXT: scratch_store_dwordx4 v36, v[30:33], off offset:112
+; GFX942-SDAG-NEXT: scratch_store_dwordx4 v36, v[26:29], off offset:96
+; GFX942-SDAG-NEXT: scratch_store_dwordx4 v36, v[22:25], off offset:80
+; GFX942-SDAG-NEXT: scratch_store_dwordx4 v36, v[18:21], off offset:64
+; GFX942-SDAG-NEXT: scratch_store_dwordx4 v36, v[14:17], off offset:48
+; GFX942-SDAG-NEXT: scratch_store_dwordx4 v36, v[10:13], off offset:32
+; GFX942-SDAG-NEXT: scratch_store_dwordx4 v36, v[6:9], off offset:16
+; GFX942-SDAG-NEXT: scratch_store_dwordx4 v36, v[2:5], off
+; GFX942-SDAG-NEXT: scratch_store_dwordx4 v36, v[30:33], off offset:240
+; GFX942-SDAG-NEXT: scratch_store_dwordx4 v36, v[26:29], off offset:224
+; GFX942-SDAG-NEXT: scratch_store_dwordx4 v36, v[22:25], off offset:208
+; GFX942-SDAG-NEXT: scratch_store_dwordx4 v36, v[18:21], off offset:192
+; GFX942-SDAG-NEXT: scratch_store_dwordx4 v36, v[14:17], off offset:176
+; GFX942-SDAG-NEXT: scratch_store_dwordx4 v36, v[10:13], off offset:160
+; GFX942-SDAG-NEXT: scratch_store_dwordx4 v36, v[6:9], off offset:144
+; GFX942-SDAG-NEXT: scratch_store_dwordx4 v36, v[2:5], off offset:128
+; GFX942-SDAG-NEXT: v_add_u32_e32 v36, 0x100, v36
+; GFX942-SDAG-NEXT: s_cbranch_vccnz .LBB10_1
+; GFX942-SDAG-NEXT: ; %bb.2: ; %static-memset-post-expansion
+; GFX942-SDAG-NEXT: s_mov_b32 s0, 0x4040404
+; GFX942-SDAG-NEXT: v_perm_b32 v2, v1, v1, s0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-SDAG-NEXT: scratch_store_dwordx4 v0, v[2:5], off offset:1024
+; GFX942-SDAG-NEXT: scratch_store_dwordx2 v0, v[2:3], off offset:1040
+; GFX942-SDAG-NEXT: scratch_store_dword v0, v2, off offset:1048
+; GFX942-SDAG-NEXT: v_lshlrev_b16_e32 v2, 8, v1
+; GFX942-SDAG-NEXT: v_or_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX942-SDAG-NEXT: scratch_store_short v0, v2, off offset:1052
+; GFX942-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1054
+; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: memset_p5_sz1055_align_4_varsetval:
+; GFX942-GISEL: ; %bb.0: ; %entry
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v1
+; GFX942-GISEL-NEXT: v_lshl_or_b32 v3, v2, 8, v2
+; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; GFX942-GISEL-NEXT: v_or3_b32 v2, v3, v4, v2
+; GFX942-GISEL-NEXT: s_mov_b64 s[0:1], 0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, v2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], 0x400
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v10, v0
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-GISEL-NEXT: .LBB10_1: ; %static-memset-expansion-main-body
+; GFX942-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 0x100, v8
+; GFX942-GISEL-NEXT: scratch_store_dwordx4 v10, v[2:5], off
+; GFX942-GISEL-NEXT: scratch_store_dwordx4 v10, v[2:5], off offset:16
+; GFX942-GISEL-NEXT: scratch_store_dwordx4 v10, v[2:5], off offset:32
+; GFX942-GISEL-NEXT: scratch_store_dwordx4 v10, v[2:5], off offset:48
+; GFX942-GISEL-NEXT: scratch_store_dwordx4 v10, v[2:5], off offset:64
+; GFX942-GISEL-NEXT: scratch_store_dwordx4 v10, v[2:5], off offset:80
+; GFX942-GISEL-NEXT: scratch_store_dwordx4 v10, v[2:5], off offset:96
+; GFX942-GISEL-NEXT: scratch_store_dwordx4 v10, v[2:5], off offset:112
+; GFX942-GISEL-NEXT: scratch_store_dwordx4 v10, v[2:5], off offset:128
+; GFX942-GISEL-NEXT: scratch_store_dwordx4 v10, v[2:5], off offset:144
+; GFX942-GISEL-NEXT: scratch_store_dwordx4 v10, v[2:5], off offset:160
+; GFX942-GISEL-NEXT: scratch_store_dwordx4 v10, v[2:5], off offset:176
+; GFX942-GISEL-NEXT: scratch_store_dwordx4 v10, v[2:5], off offset:192
+; GFX942-GISEL-NEXT: scratch_store_dwordx4 v10, v[2:5], off offset:208
+; GFX942-GISEL-NEXT: scratch_store_dwordx4 v10, v[2:5], off offset:224
+; GFX942-GISEL-NEXT: scratch_store_dwordx4 v10, v[2:5], off offset:240
+; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
+; GFX942-GISEL-NEXT: v_add_u32_e32 v10, 0x100, v10
+; GFX942-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[6:7]
+; GFX942-GISEL-NEXT: s_cbranch_vccnz .LBB10_1
+; GFX942-GISEL-NEXT: ; %bb.2: ; %static-memset-post-expansion
+; GFX942-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v1
+; GFX942-GISEL-NEXT: v_lshl_or_b32 v3, v2, 8, v2
+; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; GFX942-GISEL-NEXT: v_or3_b32 v2, v3, v4, v2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, v2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-GISEL-NEXT: scratch_store_dwordx4 v0, v[2:5], off offset:1024
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, 8
+; GFX942-GISEL-NEXT: v_lshlrev_b16_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX942-GISEL-NEXT: v_or_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX942-GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v3
+; GFX942-GISEL-NEXT: v_lshl_or_b32 v4, v4, 16, v4
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v5, v4
+; GFX942-GISEL-NEXT: scratch_store_dwordx2 v0, v[4:5], off offset:1040
+; GFX942-GISEL-NEXT: scratch_store_dword v0, v2, off offset:1048
+; GFX942-GISEL-NEXT: scratch_store_short v0, v3, off offset:1052
+; GFX942-GISEL-NEXT: scratch_store_byte v0, v1, off offset:1054
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memset.p5.i64(ptr addrspace(5) noundef nonnull align 4 %dst, i8 %setval, i64 1055, i1 false)
+ ret void
+}
+
+define void @memset_p5_sz2048_align_4_varsetval(ptr addrspace(5) align 4 %dst, i8 %setval) {
+; GFX942-SDAG-LABEL: memset_p5_sz2048_align_4_varsetval:
+; GFX942-SDAG: ; %bb.0: ; %entry
+; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT: s_mov_b32 s0, 0x4040404
+; GFX942-SDAG-NEXT: v_perm_b32 v2, v1, v1, s0
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v8, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v9, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v10, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v11, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v12, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v13, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v14, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v15, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v16, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v17, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v18, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v19, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v20, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v21, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v22, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v23, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v24, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v25, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v26, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v27, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v28, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v29, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v30, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v31, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v32, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v33, v2
+; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0
+; GFX942-SDAG-NEXT: v_mov_b64_e32 v[34:35], 0x800
+; GFX942-SDAG-NEXT: .LBB11_1: ; %static-memset-expansion-main-body
+; GFX942-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-SDAG-NEXT: s_add_u32 s0, s0, 0x100
+; GFX942-SDAG-NEXT: s_addc_u32 s1, s1, 0
+; GFX942-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[34:35]
+; GFX942-SDAG-NEXT: scratch_store_dwordx4 v0, v[30:33], off offset:112
+; GFX942-SDAG-NEXT: scratch_store_dwordx4 v0, v[26:29], off offset:96
+; GFX942-SDAG-NEXT: scratch_store_dwordx4 v0, v[22:25], off offset:80
+; GFX942-SDAG-NEXT: scratch_store_dwordx4 v0, v[18:21], off offset:64
+; GFX942-SDAG-NEXT: scratch_store_dwordx4 v0, v[14:17], off offset:48
+; GFX942-SDAG-NEXT: scratch_store_dwordx4 v0, v[10:13], off offset:32
+; GFX942-SDAG-NEXT: scratch_store_dwordx4 v0, v[6:9], off offset:16
+; GFX942-SDAG-NEXT: scratch_store_dwordx4 v0, v[2:5], off
+; GFX942-SDAG-NEXT: scratch_store_dwordx4 v0, v[30:33], off offset:240
+; GFX942-SDAG-NEXT: scratch_store_dwordx4 v0, v[26:29], off offset:224
+; GFX942-SDAG-NEXT: scratch_store_dwordx4 v0, v[22:25], off offset:208
+; GFX942-SDAG-NEXT: scratch_store_dwordx4 v0, v[18:21], off offset:192
+; GFX942-SDAG-NEXT: scratch_store_dwordx4 v0, v[14:17], off offset:176
+; GFX942-SDAG-NEXT: scratch_store_dwordx4 v0, v[10:13], off offset:160
+; GFX942-SDAG-NEXT: scratch_store_dwordx4 v0, v[6:9], off offset:144
+; GFX942-SDAG-NEXT: scratch_store_dwordx4 v0, v[2:5], off offset:128
+; GFX942-SDAG-NEXT: v_add_u32_e32 v0, 0x100, v0
+; GFX942-SDAG-NEXT: s_cbranch_vccnz .LBB11_1
+; GFX942-SDAG-NEXT: ; %bb.2: ; %static-memset-post-expansion
+; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: memset_p5_sz2048_align_4_varsetval:
+; GFX942-GISEL: ; %bb.0: ; %entry
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX942-GISEL-NEXT: v_lshl_or_b32 v2, v1, 8, v1
+; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v1
+; GFX942-GISEL-NEXT: v_lshlrev_b32_e32 v1, 24, v1
+; GFX942-GISEL-NEXT: v_or3_b32 v2, v2, v3, v1
+; GFX942-GISEL-NEXT: s_mov_b64 s[0:1], 0
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v3, v2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, v2
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], 0x800
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-GISEL-NEXT: .LBB11_1: ; %static-memset-expansion-main-body
+; GFX942-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 0x100, v8
+; GFX942-GISEL-NEXT: scratch_store_dwordx4 v0, v[2:5], off
+; GFX942-GISEL-NEXT: scratch_store_dwordx4 v0, v[2:5], off offset:16
+; GFX942-GISEL-NEXT: scratch_store_dwordx4 v0, v[2:5], off offset:32
+; GFX942-GISEL-NEXT: scratch_store_dwordx4 v0, v[2:5], off offset:48
+; GFX942-GISEL-NEXT: scratch_store_dwordx4 v0, v[2:5], off offset:64
+; GFX942-GISEL-NEXT: scratch_store_dwordx4 v0, v[2:5], off offset:80
+; GFX942-GISEL-NEXT: scratch_store_dwordx4 v0, v[2:5], off offset:96
+; GFX942-GISEL-NEXT: scratch_store_dwordx4 v0, v[2:5], off offset:112
+; GFX942-GISEL-NEXT: scratch_store_dwordx4 v0, v[2:5], off offset:128
+; GFX942-GISEL-NEXT: scratch_store_dwordx4 v0, v[2:5], off offset:144
+; GFX942-GISEL-NEXT: scratch_store_dwordx4 v0, v[2:5], off offset:160
+; GFX942-GISEL-NEXT: scratch_store_dwordx4 v0, v[2:5], off offset:176
+; GFX942-GISEL-NEXT: scratch_store_dwordx4 v0, v[2:5], off offset:192
+; GFX942-GISEL-NEXT: scratch_store_dwordx4 v0, v[2:5], off offset:208
+; GFX942-GISEL-NEXT: scratch_store_dwordx4 v0, v[2:5], off offset:224
+; GFX942-GISEL-NEXT: scratch_store_dwordx4 v0, v[2:5], off offset:240
+; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
+; GFX942-GISEL-NEXT: v_add_u32_e32 v0, 0x100, v0
+; GFX942-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[6:7]
+; GFX942-GISEL-NEXT: s_cbranch_vccnz .LBB11_1
+; GFX942-GISEL-NEXT: ; %bb.2: ; %static-memset-post-expansion
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memset.p5.i64(ptr addrspace(5) noundef nonnull align 4 %dst, i8 %setval, i64 2048, i1 false)
+ ret void
+}
+
+define void @memset_p1_varsz_align_4_set40(ptr addrspace(1) align 4 %dst, i64 %size) {
+; GFX942-SDAG-LABEL: memset_p1_varsz_align_4_set40:
+; GFX942-SDAG: ; %bb.0: ; %entry
+; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT: v_and_b32_e32 v10, -16, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v11, v3
+; GFX942-SDAG-NEXT: v_and_b32_e32 v8, 15, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v9, 0
+; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0
+; GFX942-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; GFX942-SDAG-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX942-SDAG-NEXT: s_cbranch_execz .LBB12_3
+; GFX942-SDAG-NEXT: ; %bb.1: ; %dynamic-memset-expansion-main-body.preheader
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v4, 0x28282828
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v6, v4
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, v4
+; GFX942-SDAG-NEXT: s_mov_b64 s[4:5], 0
+; GFX942-SDAG-NEXT: .LBB12_2: ; %dynamic-memset-expansion-main-body
+; GFX942-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-SDAG-NEXT: v_lshl_add_u64 v[12:13], v[0:1], 0, s[4:5]
+; GFX942-SDAG-NEXT: s_add_u32 s4, s4, 16
+; GFX942-SDAG-NEXT: s_addc_u32 s5, s5, 0
+; GFX942-SDAG-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[10:11]
+; GFX942-SDAG-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT: global_store_dwordx4 v[12:13], v[4:7], off
+; GFX942-SDAG-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT: s_cbranch_execnz .LBB12_2
+; GFX942-SDAG-NEXT: .LBB12_3: ; %Flow4
+; GFX942-SDAG-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0
+; GFX942-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
+; GFX942-SDAG-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX942-SDAG-NEXT: s_cbranch_execz .LBB12_6
+; GFX942-SDAG-NEXT: ; %bb.4: ; %dynamic-memset-expansion-residual-body.preheader
+; GFX942-SDAG-NEXT: v_and_b32_e32 v2, -16, v2
+; GFX942-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 40
+; GFX942-SDAG-NEXT: s_mov_b64 s[4:5], 0
+; GFX942-SDAG-NEXT: .LBB12_5: ; %dynamic-memset-expansion-residual-body
+; GFX942-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[4:5]
+; GFX942-SDAG-NEXT: s_add_u32 s4, s4, 1
+; GFX942-SDAG-NEXT: s_addc_u32 s5, s5, 0
+; GFX942-SDAG-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[8:9]
+; GFX942-SDAG-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT: global_store_byte v[4:5], v2, off
+; GFX942-SDAG-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT: s_cbranch_execnz .LBB12_5
+; GFX942-SDAG-NEXT: .LBB12_6: ; %Flow2
+; GFX942-SDAG-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: memset_p1_varsz_align_4_set40:
+; GFX942-GISEL: ; %bb.0: ; %entry
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_and_b32_e32 v8, 15, v2
+; GFX942-GISEL-NEXT: v_sub_co_u32_e32 v10, vcc, v2, v8
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v9, 0
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: v_subbrev_co_u32_e32 v11, vcc, 0, v3, vcc
+; GFX942-GISEL-NEXT: s_mov_b64 s[0:1], 0
+; GFX942-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; GFX942-GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX942-GISEL-NEXT: s_cbranch_execz .LBB12_3
+; GFX942-GISEL-NEXT: ; %bb.1: ; %dynamic-memset-expansion-main-body.preheader
+; GFX942-GISEL-NEXT: s_mov_b32 s4, 0x28282828
+; GFX942-GISEL-NEXT: s_mov_b32 s5, s4
+; GFX942-GISEL-NEXT: s_mov_b32 s6, s4
+; GFX942-GISEL-NEXT: s_mov_b32 s7, s4
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
+; GFX942-GISEL-NEXT: .LBB12_2: ; %dynamic-memset-expansion-main-body
+; GFX942-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, v0, v12
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, v1, v13, vcc
+; GFX942-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 16, v12
+; GFX942-GISEL-NEXT: global_store_dwordx4 v[14:15], v[4:7], off
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc
+; GFX942-GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[12:13], v[10:11]
+; GFX942-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX942-GISEL-NEXT: s_cbranch_execnz .LBB12_2
+; GFX942-GISEL-NEXT: .LBB12_3: ; %Flow4
+; GFX942-GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX942-GISEL-NEXT: s_mov_b64 s[0:1], 0
+; GFX942-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
+; GFX942-GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX942-GISEL-NEXT: s_cbranch_execz .LBB12_6
+; GFX942-GISEL-NEXT: ; %bb.4: ; %dynamic-memset-expansion-residual-body.preheader
+; GFX942-GISEL-NEXT: v_lshrrev_b64 v[2:3], 4, v[2:3]
+; GFX942-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 4, v[0:1]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, 40
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942-GISEL-NEXT: .LBB12_5: ; %dynamic-memset-expansion-residual-body
+; GFX942-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, v0, v2
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v7, vcc, v1, v3, vcc
+; GFX942-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 1, v2
+; GFX942-GISEL-NEXT: global_store_byte v[6:7], v4, off
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX942-GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[8:9]
+; GFX942-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX942-GISEL-NEXT: s_cbranch_execnz .LBB12_5
+; GFX942-GISEL-NEXT: .LBB12_6: ; %Flow2
+; GFX942-GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memset.p1.i64(ptr addrspace(1) noundef nonnull align 4 %dst, i8 40, i64 %size, i1 false)
+ ret void
+}
+
+define void @memset_p1_varsz_align_4_set0(ptr addrspace(1) align 4 %dst, i64 %size) {
+; GFX942-SDAG-LABEL: memset_p1_varsz_align_4_set0:
+; GFX942-SDAG: ; %bb.0: ; %entry
+; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT: v_and_b32_e32 v6, -16, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v7, v3
+; GFX942-SDAG-NEXT: v_and_b32_e32 v4, 15, v2
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v5, 0
+; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0
+; GFX942-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; GFX942-SDAG-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX942-SDAG-NEXT: s_cbranch_execz .LBB13_3
+; GFX942-SDAG-NEXT: ; %bb.1: ; %dynamic-memset-expansion-main-body.preheader
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v8, v5
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v9, v5
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v10, v5
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v11, v5
+; GFX942-SDAG-NEXT: s_mov_b64 s[4:5], 0
+; GFX942-SDAG-NEXT: .LBB13_2: ; %dynamic-memset-expansion-main-body
+; GFX942-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-SDAG-NEXT: v_lshl_add_u64 v[12:13], v[0:1], 0, s[4:5]
+; GFX942-SDAG-NEXT: s_add_u32 s4, s4, 16
+; GFX942-SDAG-NEXT: s_addc_u32 s5, s5, 0
+; GFX942-SDAG-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[6:7]
+; GFX942-SDAG-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT: global_store_dwordx4 v[12:13], v[8:11], off
+; GFX942-SDAG-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT: s_cbranch_execnz .LBB13_2
+; GFX942-SDAG-NEXT: .LBB13_3: ; %Flow4
+; GFX942-SDAG-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX942-SDAG-NEXT: s_mov_b64 s[0:1], 0
+; GFX942-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX942-SDAG-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX942-SDAG-NEXT: s_cbranch_execz .LBB13_6
+; GFX942-SDAG-NEXT: ; %bb.4: ; %dynamic-memset-expansion-residual-body.preheader
+; GFX942-SDAG-NEXT: v_and_b32_e32 v2, -16, v2
+; GFX942-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-SDAG-NEXT: s_mov_b64 s[4:5], 0
+; GFX942-SDAG-NEXT: .LBB13_5: ; %dynamic-memset-expansion-residual-body
+; GFX942-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-SDAG-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[4:5]
+; GFX942-SDAG-NEXT: s_add_u32 s4, s4, 1
+; GFX942-SDAG-NEXT: s_addc_u32 s5, s5, 0
+; GFX942-SDAG-NEXT: v_cmp_ge_u64_e32 vcc, s[4:5], v[4:5]
+; GFX942-SDAG-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT: global_store_byte v[6:7], v2, off
+; GFX942-SDAG-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT: s_cbranch_execnz .LBB13_5
+; GFX942-SDAG-NEXT: .LBB13_6: ; %Flow2
+; GFX942-SDAG-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX942-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: memset_p1_varsz_align_4_set0:
+; GFX942-GISEL: ; %bb.0: ; %entry
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT: v_and_b32_e32 v8, 15, v2
+; GFX942-GISEL-NEXT: v_sub_co_u32_e32 v10, vcc, v2, v8
+; GFX942-GISEL-NEXT: s_mov_b32 s0, 0
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: v_subbrev_co_u32_e32 v11, vcc, 0, v3, vcc
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v9, 0
+; GFX942-GISEL-NEXT: s_mov_b64 s[4:5], 0
+; GFX942-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; GFX942-GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX942-GISEL-NEXT: s_cbranch_execz .LBB13_3
+; GFX942-GISEL-NEXT: ; %bb.1: ; %dynamic-memset-expansion-main-body.preheader
+; GFX942-GISEL-NEXT: s_mov_b32 s2, s0
+; GFX942-GISEL-NEXT: s_mov_b32 s3, s0
+; GFX942-GISEL-NEXT: s_mov_b32 s1, s0
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[2:3]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[4:5]
+; GFX942-GISEL-NEXT: .LBB13_2: ; %dynamic-memset-expansion-main-body
+; GFX942-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, v0, v12
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, v1, v13, vcc
+; GFX942-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 16, v12
+; GFX942-GISEL-NEXT: global_store_dwordx4 v[14:15], v[4:7], off
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc
+; GFX942-GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[12:13], v[10:11]
+; GFX942-GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX942-GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX942-GISEL-NEXT: s_cbranch_execnz .LBB13_2
+; GFX942-GISEL-NEXT: .LBB13_3: ; %Flow4
+; GFX942-GISEL-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX942-GISEL-NEXT: s_mov_b64 s[0:1], 0
+; GFX942-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
+; GFX942-GISEL-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX942-GISEL-NEXT: s_cbranch_execz .LBB13_6
+; GFX942-GISEL-NEXT: ; %bb.4: ; %dynamic-memset-expansion-residual-body.preheader
+; GFX942-GISEL-NEXT: v_lshrrev_b64 v[2:3], 4, v[2:3]
+; GFX942-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 4, v[0:1]
+; GFX942-GISEL-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942-GISEL-NEXT: .LBB13_5: ; %dynamic-memset-expansion-residual-body
+; GFX942-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, v0, v2
+; GFX942-GISEL-NEXT: s_nop 1
+; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v7, vcc, v1, v3, vcc
+; GFX942-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 1, v2
+; GFX942-GISEL-NEXT: global_store_byte v[6:7], v4, off
+; GFX942-GISEL-NEXT: s_nop 0
+; GFX942-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX942-GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[8:9]
+; GFX942-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX942-GISEL-NEXT: s_cbranch_execnz .LBB13_5
+; GFX942-GISEL-NEXT: .LBB13_6: ; %Flow2
+; GFX942-GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX942-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT: s_setpc_b64 s[30:31]
+entry:
+ tail call void @llvm.memset.p1.i64(ptr addrspace(1) noundef nonnull align 4 %dst, i8 0, i64 %size, i1 false)
+ ret void
+}
+
+declare void @llvm.memset.p0.i64(ptr addrspace(0) noalias nocapture writeonly, i8, i64, i1 immarg)
+declare void @llvm.memset.p1.i64(ptr addrspace(1) noalias nocapture writeonly, i8, i64, i1 immarg)
+declare void @llvm.memset.p3.i64(ptr addrspace(3) noalias nocapture writeonly, i8, i64, i1 immarg)
+declare void @llvm.memset.p5.i64(ptr addrspace(5) noalias nocapture writeonly, i8, i64, i1 immarg)
+
+
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX942: {{.*}}
diff --git a/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll b/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll
index ad78e0fe7438b..6c96eab1439fb 100644
--- a/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll
@@ -117,9 +117,9 @@ entry:
; IR-LABEL: @memset_caller
; IR: [[VAL:%[0-9]+]] = trunc i32 %c to i8
-; IR: [[CMPREG:%[0-9]+]] = icmp eq i64 0, %n
-; IR: br i1 [[CMPREG]], label %split, label %loadstoreloop
-; IR: loadstoreloop:
+; IR: [[CMPREG:%[0-9]+]] = icmp ne i64 %n, 0
+; IR: br i1 [[CMPREG]], label %dynamic-memset-expansion-main-body, label %dynamic-memset-post-expansion
+; IR: dynamic-memset-expansion-main-body:
; IR: [[STOREPTR:%[0-9]+]] = getelementptr inbounds i8, ptr %dst, i64
; IR-NEXT: store i8 [[VAL]], ptr [[STOREPTR]]
@@ -141,7 +141,7 @@ entry:
; IR-LABEL: @volatile_memset_caller
; IR: [[VAL:%[0-9]+]] = trunc i32 %c to i8
-; IR: loadstoreloop:
+; IR: dynamic-memset-expansion-main-body:
; IR: [[STOREPTR:%[0-9]+]] = getelementptr inbounds i8, ptr %dst, i64
; IR-NEXT: store volatile i8 [[VAL]], ptr [[STOREPTR]]
}
diff --git a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/memset.ll b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/memset.ll
index d5e70ae9e7aa8..eb184bae7fa5c 100644
--- a/llvm/test/CodeGen/SPIRV/llvm-intrinsics/memset.ll
+++ b/llvm/test/CodeGen/SPIRV/llvm-intrinsics/memset.ll
@@ -39,8 +39,8 @@
; CHECK: %[[#Volatile:]] = OpFunctionParameter %[[#]]
; CHECK: %[[#Entry:]] = OpLabel
-; CHECK: %[[#IsZeroLen:]] = OpIEqual %[[#]] %[[#Zero:]] %[[#Len]]
-; CHECK: OpBranchConditional %[[#IsZeroLen]] %[[#End:]] %[[#WhileBody:]]
+; CHECK: %[[#IsNonZeroLen:]] = OpINotEqual %[[#]] %[[#Len]] %[[#Zero:]]
+; CHECK: OpBranchConditional %[[#IsNonZeroLen]] %[[#WhileBody:]] %[[#End:]]
; CHECK: %[[#WhileBody]] = OpLabel
; CHECK: %[[#Offset:]] = OpPhi %[[#]] %[[#Zero]] %[[#Entry]] %[[#OffsetInc:]] %[[#WhileBody]]
diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/X86/memset-inline-non-constant-len.ll b/llvm/test/Transforms/PreISelIntrinsicLowering/X86/memset-inline-non-constant-len.ll
index 0843b1532f843..8a371083548eb 100644
--- a/llvm/test/Transforms/PreISelIntrinsicLowering/X86/memset-inline-non-constant-len.ll
+++ b/llvm/test/Transforms/PreISelIntrinsicLowering/X86/memset-inline-non-constant-len.ll
@@ -17,26 +17,26 @@ define void @memset_32(ptr %a, i8 %value) nounwind {
define void @memset_x(ptr %a, i8 %value, i64 %x) nounwind {
; CHECK-LABEL: define void @memset_x(
; CHECK-SAME: ptr [[A:%.*]], i8 [[VALUE:%.*]], i64 [[X:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 0, [[X]]
-; CHECK-NEXT: br i1 [[TMP1]], label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
-; CHECK: [[LOADSTORELOOP]]:
-; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOADSTORELOOP]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i64 [[X]], 0
+; CHECK-NEXT: br i1 [[TMP1]], label %[[DYNAMIC_MEMSET_LOOP_EXPANSION_MAIN_BODY:.*]], label %[[DYNAMIC_MEMSET_POST_LOOP_EXPANSION:.*]]
+; CHECK: [[DYNAMIC_MEMSET_LOOP_EXPANSION_MAIN_BODY]]:
+; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[DYNAMIC_MEMSET_LOOP_EXPANSION_MAIN_BODY]] ]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP2]]
; CHECK-NEXT: store i8 [[VALUE]], ptr [[TMP3]], align 1
; CHECK-NEXT: [[TMP4]] = add i64 [[TMP2]], 1
; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], [[X]]
-; CHECK-NEXT: br i1 [[TMP5]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
-; CHECK: [[SPLIT]]:
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 0, [[X]]
-; CHECK-NEXT: br i1 [[TMP6]], label %[[SPLIT1:.*]], label %[[LOADSTORELOOP2:.*]]
-; CHECK: [[LOADSTORELOOP2]]:
-; CHECK-NEXT: [[TMP7:%.*]] = phi i64 [ 0, %[[SPLIT]] ], [ [[TMP9:%.*]], %[[LOADSTORELOOP2]] ]
+; CHECK-NEXT: br i1 [[TMP5]], label %[[DYNAMIC_MEMSET_LOOP_EXPANSION_MAIN_BODY]], label %[[DYNAMIC_MEMSET_POST_LOOP_EXPANSION]]
+; CHECK: [[DYNAMIC_MEMSET_POST_LOOP_EXPANSION]]:
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[X]], 0
+; CHECK-NEXT: br i1 [[TMP6]], label %[[DYNAMIC_MEMSET_LOOP_EXPANSION_MAIN_BODY2:.*]], label %[[DYNAMIC_MEMSET_POST_LOOP_EXPANSION1:.*]]
+; CHECK: [[DYNAMIC_MEMSET_LOOP_EXPANSION_MAIN_BODY2]]:
+; CHECK-NEXT: [[TMP7:%.*]] = phi i64 [ 0, %[[DYNAMIC_MEMSET_POST_LOOP_EXPANSION]] ], [ [[TMP9:%.*]], %[[DYNAMIC_MEMSET_LOOP_EXPANSION_MAIN_BODY2]] ]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP7]]
; CHECK-NEXT: store volatile i8 [[VALUE]], ptr [[TMP8]], align 1
; CHECK-NEXT: [[TMP9]] = add i64 [[TMP7]], 1
; CHECK-NEXT: [[TMP10:%.*]] = icmp ult i64 [[TMP9]], [[X]]
-; CHECK-NEXT: br i1 [[TMP10]], label %[[LOADSTORELOOP2]], label %[[SPLIT1]]
-; CHECK: [[SPLIT1]]:
+; CHECK-NEXT: br i1 [[TMP10]], label %[[DYNAMIC_MEMSET_LOOP_EXPANSION_MAIN_BODY2]], label %[[DYNAMIC_MEMSET_POST_LOOP_EXPANSION1]]
+; CHECK: [[DYNAMIC_MEMSET_POST_LOOP_EXPANSION1]]:
; CHECK-NEXT: ret void
;
call void @llvm.memset.inline.p0.i64(ptr %a, i8 %value, i64 %x, i1 0)