[llvm-branch-commits] [llvm] [LowerMemIntrinsics] Optimize memset lowering (PR #169040)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Fri Nov 21 05:57:11 PST 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Fabian Ritter (ritter-x2a)
This patch changes the memset lowering to match the optimized memcpy lowering.
The memset lowering now queries TTI.getMemcpyLoopLoweringType for a preferred
memory access type. If that type is larger than a byte, the memset is lowered
into two loops: a main loop that stores a sufficiently wide vector splat of the
SetValue using the preferred memory access type, and a residual loop that sets
the remaining bytes individually. If the memset size is statically known, the
residual loop is replaced by a straight-line sequence of stores.
This improves memset performance on gfx1030 (AMDGPU) in microbenchmarks by
around 7-20x.
I'm planning similar treatment for memset.pattern as a follow-up PR.
For SWDEV-543208.
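To illustrate the updated interface, here is a minimal sketch (not part of the patch) of how a caller can obtain the TTI for the parent function and pass it to the new `expandMemSetAsLoop` overload, following the pattern the diff applies to the NVPTX, SPIRV, and AMDGPU callers. The helper name `expandOneMemSet` and the `TargetMachine` parameter are illustrative; in-tree callers get TTI from their own pass infrastructure.

```cpp
// Minimal sketch (not part of the patch): expand one memset intrinsic with
// the updated interface. The helper name and the TargetMachine parameter are
// illustrative only.
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"

using namespace llvm;

static void expandOneMemSet(MemSetInst &MSI, const TargetMachine &TM) {
  // TTI now selects the memory access type for the main set loop, matching
  // the existing memcpy/memmove lowering.
  const TargetTransformInfo &TTI =
      TM.getTargetTransformInfo(*MSI.getFunction());
  expandMemSetAsLoop(&MSI, TTI);
  // The expansion does not delete the intrinsic, so erase it afterwards.
  MSI.eraseFromParent();
}
```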
---
Patch is 343.27 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/169040.diff
17 Files Affected:
- (modified) llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h (+2-1)
- (modified) llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp (+4-2)
- (modified) llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp (+2-1)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp (+2-1)
- (modified) llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp (+1-1)
- (modified) llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp (+7-4)
- (modified) llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp (+197-7)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll (+103-11)
- (modified) llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll (+686-90)
- (modified) llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll (+218-116)
- (modified) llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics-threshold.ll (+15-36)
- (modified) llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll (+55-13)
- (modified) llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll (+1616)
- (added) llvm/test/CodeGen/AMDGPU/memset-param-combinations.ll (+1900)
- (modified) llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll (+4-4)
- (modified) llvm/test/CodeGen/SPIRV/llvm-intrinsics/memset.ll (+2-2)
- (modified) llvm/test/Transforms/PreISelIntrinsicLowering/X86/memset-inline-non-constant-len.ll (+12-12)
``````````diff
diff --git a/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h b/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
index d4e72a60fc1ea..8924b8b1e6e54 100644
--- a/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
+++ b/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
@@ -59,7 +59,8 @@ LLVM_ABI bool expandMemMoveAsLoop(MemMoveInst *MemMove,
const TargetTransformInfo &TTI);
/// Expand \p MemSet as a loop. \p MemSet is not deleted.
-LLVM_ABI void expandMemSetAsLoop(MemSetInst *MemSet);
+LLVM_ABI void expandMemSetAsLoop(MemSetInst *MemSet,
+ const TargetTransformInfo &TTI);
/// Expand \p MemSetPattern as a loop. \p MemSet is not deleted.
LLVM_ABI void expandMemSetPatternAsLoop(MemSetPatternInst *MemSet);
diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
index d738dc4eea36d..88e2bb81f9e3b 100644
--- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
+++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
@@ -369,7 +369,7 @@ bool PreISelIntrinsicLowering::expandMemIntrinsicUses(
canEmitLibcall(TM, ParentFunc, RTLIB::MEMSET))
break;
- expandMemSetAsLoop(Memset);
+ expandMemSetAsLoop(Memset, TTI);
Changed = true;
Memset->eraseFromParent();
}
@@ -384,7 +384,9 @@ bool PreISelIntrinsicLowering::expandMemIntrinsicUses(
if (isa<ConstantInt>(Memset->getLength()))
break;
- expandMemSetAsLoop(Memset);
+ Function *ParentFunc = Memset->getFunction();
+ const TargetTransformInfo &TTI = LookupTTI(*ParentFunc);
+ expandMemSetAsLoop(Memset, TTI);
Changed = true;
Memset->eraseFromParent();
break;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index fdff21b6ef8df..76f1e006bbf74 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -635,7 +635,8 @@ bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitMemSetInst(
MemSetInst &MSI) {
if (MSI.getDestAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER)
return false;
- llvm::expandMemSetAsLoop(&MSI);
+ llvm::expandMemSetAsLoop(&MSI,
+ TM->getTargetTransformInfo(*MSI.getFunction()));
MSI.eraseFromParent();
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 03d16fdd54c42..5a68dca1b10b7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -80,7 +80,8 @@ static cl::opt<size_t> InlineMaxBB(
static cl::opt<unsigned> MemcpyLoopUnroll(
"amdgpu-memcpy-loop-unroll",
cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory "
- "operations when lowering memcpy as a loop"),
+               "operations when lowering statically-sized memcpy, memmove, or "
+ "memset as a loop"),
cl::init(16), cl::Hidden);
static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
index ac6f4061b9f1f..d0b50d2610bd5 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
@@ -128,7 +128,7 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) {
} else if (MemMoveInst *Memmove = dyn_cast<MemMoveInst>(MemCall)) {
expandMemMoveAsLoop(Memmove, TTI);
} else if (MemSetInst *Memset = dyn_cast<MemSetInst>(MemCall)) {
- expandMemSetAsLoop(Memset);
+ expandMemSetAsLoop(Memset, TTI);
}
MemCall->eraseFromParent();
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
index be88f334d2171..8bee6da75cc75 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
@@ -23,6 +23,7 @@
#include "SPIRVTargetMachine.h"
#include "SPIRVUtils.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/IR/IRBuilder.h"
@@ -93,7 +94,8 @@ static Function *getOrCreateFunction(Module *M, Type *RetTy,
return NewF;
}
-static bool lowerIntrinsicToFunction(IntrinsicInst *Intrinsic) {
+static bool lowerIntrinsicToFunction(IntrinsicInst *Intrinsic,
+ const TargetTransformInfo &TTI) {
// For @llvm.memset.* intrinsic cases with constant value and length arguments
// are emulated via "storing" a constant array to the destination. For other
// cases we wrap the intrinsic in @spirv.llvm_memset_* function and expand the
@@ -137,7 +139,7 @@ static bool lowerIntrinsicToFunction(IntrinsicInst *Intrinsic) {
auto *MemSet = IRB.CreateMemSet(Dest, Val, Len, MSI->getDestAlign(),
MSI->isVolatile());
IRB.CreateRetVoid();
- expandMemSetAsLoop(cast<MemSetInst>(MemSet));
+ expandMemSetAsLoop(cast<MemSetInst>(MemSet), TTI);
MemSet->eraseFromParent();
break;
}
@@ -399,6 +401,7 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) {
bool Changed = false;
const SPIRVSubtarget &STI = TM.getSubtarget<SPIRVSubtarget>(*F);
SmallVector<Instruction *> EraseFromParent;
+ const TargetTransformInfo &TTI = TM.getTargetTransformInfo(*F);
for (BasicBlock &BB : *F) {
for (Instruction &I : make_early_inc_range(BB)) {
auto Call = dyn_cast<CallInst>(&I);
@@ -411,7 +414,7 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) {
switch (II->getIntrinsicID()) {
case Intrinsic::memset:
case Intrinsic::bswap:
- Changed |= lowerIntrinsicToFunction(II);
+ Changed |= lowerIntrinsicToFunction(II, TTI);
break;
case Intrinsic::fshl:
case Intrinsic::fshr:
@@ -459,7 +462,7 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) {
return false;
return II->getCalledFunction()->getName().starts_with(Prefix);
}))
- Changed |= lowerIntrinsicToFunction(II);
+ Changed |= lowerIntrinsicToFunction(II, TTI);
break;
}
}
diff --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
index 88e5d038bff82..07bfceb99d206 100644
--- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -930,9 +930,187 @@ static void createMemMoveLoopKnownSize(Instruction *InsertBefore,
}
}
+/// Create a Value of \p DstType that consists of a sequence of copies of
+/// \p SetValue, using bitcasts and a vector splat.
+static Value *createMemSetSplat(const DataLayout &DL, IRBuilderBase &B,
+ Value *SetValue, Type *DstType) {
+ unsigned DstSize = DL.getTypeStoreSize(DstType);
+ Type *SetValueType = SetValue->getType();
+ unsigned SetValueSize = DL.getTypeStoreSize(SetValueType);
+ assert(SetValueSize == DL.getTypeAllocSize(SetValueType) &&
+ "Store size and alloc size of SetValue's type must match");
+ assert(SetValueSize != 0 && DstSize % SetValueSize == 0 &&
+ "DstType size must be a multiple of SetValue size");
+
+ Value *Result = SetValue;
+ if (DstSize != SetValueSize) {
+ if (!SetValueType->isIntegerTy() && !SetValueType->isFloatingPointTy()) {
+ // If the type cannot be put into a vector, bitcast to iN first.
+ LLVMContext &Ctx = SetValue->getContext();
+ Result = B.CreateBitCast(Result, Type::getIntNTy(Ctx, SetValueSize * 8),
+ "setvalue.toint");
+ }
+ // Form a sufficiently large vector consisting of SetValue, repeated.
+ Result =
+ B.CreateVectorSplat(DstSize / SetValueSize, Result, "setvalue.splat");
+ }
+
+ // The value has the right size, but we might have to bitcast it to the right
+ // type.
+ if (Result->getType() != DstType) {
+ Result = B.CreateBitCast(Result, DstType, "setvalue.splat.cast");
+ }
+ return Result;
+}
+
+static void createMemSetLoopKnownSize(Instruction *InsertBefore, Value *DstAddr,
+ ConstantInt *Len, Value *SetValue,
+ Align DstAlign, bool IsVolatile,
+ const TargetTransformInfo &TTI) {
+ // No need to expand zero length memsets.
+ if (Len->isZero())
+ return;
+
+ BasicBlock *PreLoopBB = InsertBefore->getParent();
+ Function *ParentFunc = PreLoopBB->getParent();
+ const DataLayout &DL = ParentFunc->getDataLayout();
+ LLVMContext &Ctx = PreLoopBB->getContext();
+
+ unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
+
+ Type *TypeOfLen = Len->getType();
+ Type *Int8Type = Type::getInt8Ty(Ctx);
+ assert(SetValue->getType() == Int8Type && "Can only set bytes");
+
+ // Use the same memory access type as for a memcpy with the same Dst and Src
+ // alignment and address space.
+ Type *LoopOpType = TTI.getMemcpyLoopLoweringType(
+ Ctx, Len, DstAS, DstAS, DstAlign, DstAlign, std::nullopt);
+ unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
+
+ uint64_t LoopEndCount = alignDown(Len->getZExtValue(), LoopOpSize);
+
+ if (LoopEndCount != 0) {
+ Value *SplatSetValue = nullptr;
+ {
+ IRBuilder<> PreLoopBuilder(InsertBefore);
+ SplatSetValue =
+ createMemSetSplat(DL, PreLoopBuilder, SetValue, LoopOpType);
+ }
+
+ // Don't generate a residual loop, the remaining bytes are set with
+ // straight-line code.
+ LoopExpansionInfo LEI =
+ insertLoopExpansion(InsertBefore, Len, LoopOpSize, 0, "static-memset");
+
+ // Fill MainLoopBB
+ IRBuilder<> MainLoopBuilder(LEI.MainLoopIP);
+ Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));
+
+ Value *DstGEP =
+ MainLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, LEI.MainLoopIndex);
+
+ MainLoopBuilder.CreateAlignedStore(SplatSetValue, DstGEP, PartDstAlign,
+ IsVolatile);
+
+ assert(!LEI.ResidualLoopIP && !LEI.ResidualLoopIndex &&
+ "No residual loop was requested");
+ }
+
+ uint64_t BytesSet = LoopEndCount;
+ uint64_t RemainingBytes = Len->getZExtValue() - BytesSet;
+ if (RemainingBytes == 0)
+ return;
+
+ IRBuilder<> RBuilder(InsertBefore);
+
+ SmallVector<Type *, 5> RemainingOps;
+ TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
+ DstAS, DstAS, DstAlign, DstAlign,
+ std::nullopt);
+
+ Type *PreviousOpTy = nullptr;
+ Value *SplatSetValue = nullptr;
+ for (auto *OpTy : RemainingOps) {
+ unsigned OperandSize = DL.getTypeStoreSize(OpTy);
+ Align PartDstAlign(commonAlignment(DstAlign, BytesSet));
+
+ // Avoid recomputing the splat SetValue if it's the same as for the last
+ // iteration.
+ if (OpTy != PreviousOpTy)
+ SplatSetValue = createMemSetSplat(DL, RBuilder, SetValue, OpTy);
+
+ Value *DstGEP = RBuilder.CreateInBoundsGEP(
+ Int8Type, DstAddr, ConstantInt::get(TypeOfLen, BytesSet));
+ RBuilder.CreateAlignedStore(SplatSetValue, DstGEP, PartDstAlign,
+ IsVolatile);
+ BytesSet += OperandSize;
+ PreviousOpTy = OpTy;
+ }
+ assert(BytesSet == Len->getZExtValue() &&
+ "Bytes set should match size in the call!");
+}
+
+static void createMemSetLoopUnknownSize(Instruction *InsertBefore,
+ Value *DstAddr, Value *Len,
+ Value *SetValue, Align DstAlign,
+ bool IsVolatile,
+ const TargetTransformInfo &TTI) {
+ BasicBlock *PreLoopBB = InsertBefore->getParent();
+ Function *ParentFunc = PreLoopBB->getParent();
+ const DataLayout &DL = ParentFunc->getDataLayout();
+ LLVMContext &Ctx = PreLoopBB->getContext();
+
+ unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
+
+ Type *Int8Type = Type::getInt8Ty(Ctx);
+ assert(SetValue->getType() == Int8Type && "Can only set bytes");
+
+ Type *LoopOpType = TTI.getMemcpyLoopLoweringType(
+ Ctx, Len, DstAS, DstAS, DstAlign, DstAlign, std::nullopt);
+ unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
+
+ Type *ResidualLoopOpType = Int8Type;
+ unsigned ResidualLoopOpSize = DL.getTypeStoreSize(ResidualLoopOpType);
+
+ Value *SplatSetValue = SetValue;
+ {
+ IRBuilder<> PreLoopBuilder(InsertBefore);
+ SplatSetValue = createMemSetSplat(DL, PreLoopBuilder, SetValue, LoopOpType);
+ }
+
+ LoopExpansionInfo LEI = insertLoopExpansion(
+ InsertBefore, Len, LoopOpSize, ResidualLoopOpSize, "dynamic-memset");
+
+ // Fill MainLoopBB
+ IRBuilder<> MainLoopBuilder(LEI.MainLoopIP);
+ Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));
+
+ Value *DstGEP =
+ MainLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, LEI.MainLoopIndex);
+ MainLoopBuilder.CreateAlignedStore(SplatSetValue, DstGEP, PartDstAlign,
+ IsVolatile);
+
+ // Fill ResidualLoopBB
+ if (!LEI.ResidualLoopIP)
+ return;
+
+ Align ResDstAlign(commonAlignment(PartDstAlign, ResidualLoopOpSize));
+
+ IRBuilder<> ResLoopBuilder(LEI.ResidualLoopIP);
+
+ Value *ResDstGEP = ResLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr,
+ LEI.ResidualLoopIndex);
+ ResLoopBuilder.CreateAlignedStore(SetValue, ResDstGEP, ResDstAlign,
+ IsVolatile);
+}
+
static void createMemSetLoop(Instruction *InsertBefore, Value *DstAddr,
Value *CopyLen, Value *SetValue, Align DstAlign,
bool IsVolatile) {
+  // No longer used for memset; only used for memset.pattern.
+ // TODO: Update the memset.pattern lowering to also use the loop expansion
+ // framework and remove this function.
Type *TypeOfCopyLen = CopyLen->getType();
BasicBlock *OrigBB = InsertBefore->getParent();
Function *F = OrigBB->getParent();
@@ -1067,13 +1245,25 @@ bool llvm::expandMemMoveAsLoop(MemMoveInst *Memmove,
return true;
}
-void llvm::expandMemSetAsLoop(MemSetInst *Memset) {
- createMemSetLoop(/* InsertBefore */ Memset,
- /* DstAddr */ Memset->getRawDest(),
- /* CopyLen */ Memset->getLength(),
- /* SetValue */ Memset->getValue(),
- /* Alignment */ Memset->getDestAlign().valueOrOne(),
- Memset->isVolatile());
+void llvm::expandMemSetAsLoop(MemSetInst *Memset,
+ const TargetTransformInfo &TTI) {
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Memset->getLength())) {
+ createMemSetLoopKnownSize(
+ /* InsertBefore */ Memset,
+ /* DstAddr */ Memset->getRawDest(),
+ /* Len */ CI,
+ /* SetValue */ Memset->getValue(),
+ /* DstAlign */ Memset->getDestAlign().valueOrOne(),
+ Memset->isVolatile(), TTI);
+ } else {
+ createMemSetLoopUnknownSize(
+ /* InsertBefore */ Memset,
+ /* DstAddr */ Memset->getRawDest(),
+ /* Len */ Memset->getLength(),
+ /* SetValue */ Memset->getValue(),
+ /* DstAlign */ Memset->getDestAlign().valueOrOne(),
+ Memset->isVolatile(), TTI);
+ }
}
void llvm::expandMemSetPatternAsLoop(MemSetPatternInst *Memset) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll
index 04652af147f9b..4d35f3198bc0a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll
@@ -1,27 +1,87 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-- -mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=LOOP %s
-; RUN: llc -global-isel -mtriple=amdgcn-- -mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=UNROLL %s
+; RUN: llc -global-isel -mtriple=amdgcn-- -amdgpu-memcpy-loop-unroll=2 -mem-intrinsic-expand-size=35 %s -o - | FileCheck -check-prefix=LOOP %s
+; RUN: llc -global-isel -mtriple=amdgcn-- -amdgpu-memcpy-loop-unroll=2 -mem-intrinsic-expand-size=37 %s -o - | FileCheck -check-prefix=UNROLL %s
declare void @llvm.memset.p1.i32(ptr addrspace(1), i8, i32, i1)
define amdgpu_cs void @memset_p1i8(ptr addrspace(1) %dst, i8 %val) {
; LOOP-LABEL: memset_p1i8:
-; LOOP: ; %bb.0: ; %loadstoreloop.preheader
+; LOOP: ; %bb.0:
+; LOOP-NEXT: v_and_b32_e32 v3, 0xff, v2
; LOOP-NEXT: s_mov_b64 s[0:1], 0
; LOOP-NEXT: s_mov_b32 s2, 0
; LOOP-NEXT: s_mov_b32 s3, 0xf000
+; LOOP-NEXT: v_lshlrev_b32_e32 v4, 8, v3
+; LOOP-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; LOOP-NEXT: v_lshlrev_b32_e32 v6, 24, v3
+; LOOP-NEXT: v_or_b32_e32 v3, v3, v4
+; LOOP-NEXT: v_or_b32_e32 v3, v3, v5
+; LOOP-NEXT: v_or_b32_e32 v5, v3, v6
+; LOOP-NEXT: v_lshrrev_b32_e32 v6, 16, v5
+; LOOP-NEXT: v_bfe_u32 v7, v5, 8, 8
+; LOOP-NEXT: v_lshrrev_b32_e32 v8, 24, v5
; LOOP-NEXT: v_mov_b32_e32 v4, s1
; LOOP-NEXT: v_mov_b32_e32 v3, s0
-; LOOP-NEXT: .LBB0_1: ; %loadstoreloop
+; LOOP-NEXT: .LBB0_1: ; %static-memset-expansion-main-body
; LOOP-NEXT: ; =>This Inner Loop Header: Depth=1
-; LOOP-NEXT: v_add_i32_e32 v5, vcc, v0, v3
-; LOOP-NEXT: v_addc_u32_e32 v6, vcc, v1, v4, vcc
-; LOOP-NEXT: v_add_i32_e32 v3, vcc, 1, v3
+; LOOP-NEXT: v_add_i32_e32 v9, vcc, v0, v3
+; LOOP-NEXT: v_addc_u32_e32 v10, vcc, v1, v4, vcc
+; LOOP-NEXT: v_add_i32_e32 v3, vcc, 32, v3
; LOOP-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
-; LOOP-NEXT: v_cmp_gt_u32_e32 vcc, 4, v3
-; LOOP-NEXT: buffer_store_byte v2, v[5:6], s[0:3], 0 addr64
+; LOOP-NEXT: buffer_store_byte v5, v[9:10], s[0:3], 0 addr64
+; LOOP-NEXT: buffer_store_byte v7, v[9:10], s[0:3], 0 addr64 offset:1
+; LOOP-NEXT: buffer_store_byte v6, v[9:10], s[0:3], 0 addr64 offset:2
+; LOOP-NEXT: buffer_store_byte v8, v[9:10], s[0:3], 0 addr64 offset:3
+; LOOP-NEXT: buffer_store_byte v5, v[9:10], s[0:3], 0 addr64 offset:4
+; LOOP-NEXT: buffer_store_byte v7, v[9:10], s[0:3], 0 addr64 offset:5
+; LOOP-NEXT: buffer_store_byte v6, v[9:10], s[0:3], 0 addr64 offset:6
+; LOOP-NEXT: buffer_store_byte v8, v[9:10], s[0:3], 0 addr64 offset:7
+; LOOP-NEXT: buffer_store_byte v5, v[9:10], s[0:3], 0 addr64 offset:8
+; LOOP-NEXT: buffer_store_byte v7, v[9:10], s[0:3], 0 addr64 offset:9
+; LOOP-NEXT: buffer_store_byte v6, v[9:10], s[0:3], 0 addr64 offset:10
+; LOOP-NEXT: buffer_store_byte v8, v[9:10], s[0:3], 0 addr64 offset:11
+; LOOP-NEXT: buffer_store_byte v5, v[9:10], s[0:3], 0 addr64 offset:12
+; LOOP-NEXT: buffer_store_byte v7, v[9:10], s[0:3], 0 addr64 offset:13
+; LOOP-NEXT: buffer_store_byte v6, v[9:10], s[0:3], 0 addr64 offset:14
+; LOOP-NEXT: buffer_store_byte v8, v[9:10], s[0:3], 0 addr64 offset:15
+; LOOP-NEXT: buffer_store_byte v5, v[9:10], s[0:3], 0 addr64 offset:16
+; LOOP-NEXT: buffer_store_byte v7, v[9:10], s[0:3], 0 addr64 offset:17
+; LOOP-NEXT: buffer_store_byte v6, v[9:10], s[0:3], 0 addr64 offset:18
+; LOOP-NEXT: buffer_store_byte v8, v[9:10], s[0:3], 0 addr64 offset:19
+; LOOP-NEXT: buffer_store_byte v5, v[9:10], s[0:3], 0 addr64 offset:20
+; LOOP-NEXT: buffer_store_byte v7, v[9:10], s[0:3], 0 addr64 offset:21
+; LOOP-NEXT: buffer_store_byte v6, v[9:10], s[0:3], 0 addr64 offset:22
+; LOOP-NEXT: buffer_store_byte v8, v[9:10], s[0:3], 0 addr64 offset:23
+; LOOP-NEXT: buffer_store_byte v5, v[9:10], s[0:3], 0 addr64 offset:24
+; LOOP-NEXT: buffer_store_byte v7, v[9:10], s[0:3], 0 addr64 offset:25
+; LOOP-NEXT: buffer_store_byte v6, v[9:10], s[0:3], 0 addr64 offset:26
+; LOOP-NEXT: buffer_store_byte v8, v[9:10], s[0:3], 0 addr64 offset:27
+; LOOP-NEXT: buffer_store_byte v5, v[9:10], s[0:3], 0 addr64 offset:28
+; LOOP-NEXT: buffer_store_byte v7, v[9:10], s[0:3], 0 addr64 offset:29
+; LOOP-NEXT: buffer_store_byte v6, v[9:10], s[0:3], 0 addr64 offset:30
+; LOOP-NEXT: v_cmp_gt_u32_e32 vcc, 32, v3
+; LOOP-NE...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/169040
More information about the llvm-branch-commits mailing list