[llvm-branch-commits] [llvm] [LowerMemIntrinsics] Optimize memset lowering (PR #169040)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Fri Nov 21 05:57:11 PST 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Fabian Ritter (ritter-x2a)
This patch changes the memset lowering to match the optimized memcpy lowering.
The memset lowering now queries TTI.getMemcpyLoopLoweringType for a preferred
memory access type. If that type is larger than a byte, the memset is lowered
into two loops: a main loop that stores a sufficiently wide vector splat of the
SetValue using the preferred memory access type, and a residual loop that sets
the remaining bytes individually. If the memset size is statically known, the
residual loop is replaced by a straight-line sequence of stores.
This improves memset performance on gfx1030 (AMDGPU) in microbenchmarks by
around 7-20x.
I'm planning similar treatment for memset.pattern as a follow-up PR.
For SWDEV-543208.
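To illustrate the updated interface, here is a minimal sketch (not part of the patch) of how a caller can obtain the TTI for the parent function and pass it to the new `expandMemSetAsLoop` overload, following the pattern the diff applies to the NVPTX, SPIRV, and AMDGPU callers. The helper name `expandOneMemSet` and the `TargetMachine` parameter are illustrative; in-tree callers get TTI from their own pass infrastructure.

```cpp
// Minimal sketch (not part of the patch): expand one memset intrinsic with
// the updated interface. The helper name and the TargetMachine parameter are
// illustrative only.
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"

using namespace llvm;

static void expandOneMemSet(MemSetInst &MSI, const TargetMachine &TM) {
  // TTI now selects the memory access type for the main set loop, matching
  // the existing memcpy/memmove lowering.
  const TargetTransformInfo &TTI =
      TM.getTargetTransformInfo(*MSI.getFunction());
  expandMemSetAsLoop(&MSI, TTI);
  // The expansion does not delete the intrinsic, so erase it afterwards.
  MSI.eraseFromParent();
}
```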
---
Patch is 343.27 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/169040.diff
17 Files Affected:
- (modified) llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h (+2-1)
- (modified) llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp (+4-2)
- (modified) llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp (+2-1)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp (+2-1)
- (modified) llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp (+1-1)
- (modified) llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp (+7-4)
- (modified) llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp (+197-7)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll (+103-11)
- (modified) llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll (+686-90)
- (modified) llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-mem-transfer.ll (+218-116)
- (modified) llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics-threshold.ll (+15-36)
- (modified) llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll (+55-13)
- (modified) llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll (+1616)
- (added) llvm/test/CodeGen/AMDGPU/memset-param-combinations.ll (+1900)
- (modified) llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll (+4-4)
- (modified) llvm/test/CodeGen/SPIRV/llvm-intrinsics/memset.ll (+2-2)
- (modified) llvm/test/Transforms/PreISelIntrinsicLowering/X86/memset-inline-non-constant-len.ll (+12-12)
``````````diff
diff --git a/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h b/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
index d4e72a60fc1ea..8924b8b1e6e54 100644
--- a/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
+++ b/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
@@ -59,7 +59,8 @@ LLVM_ABI bool expandMemMoveAsLoop(MemMoveInst *MemMove,
const TargetTransformInfo &TTI);
/// Expand \p MemSet as a loop. \p MemSet is not deleted.
-LLVM_ABI void expandMemSetAsLoop(MemSetInst *MemSet);
+LLVM_ABI void expandMemSetAsLoop(MemSetInst *MemSet,
+ const TargetTransformInfo &TTI);
/// Expand \p MemSetPattern as a loop. \p MemSet is not deleted.
LLVM_ABI void expandMemSetPatternAsLoop(MemSetPatternInst *MemSet);
diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
index d738dc4eea36d..88e2bb81f9e3b 100644
--- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
+++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
@@ -369,7 +369,7 @@ bool PreISelIntrinsicLowering::expandMemIntrinsicUses(
canEmitLibcall(TM, ParentFunc, RTLIB::MEMSET))
break;
- expandMemSetAsLoop(Memset);
+ expandMemSetAsLoop(Memset, TTI);
Changed = true;
Memset->eraseFromParent();
}
@@ -384,7 +384,9 @@ bool PreISelIntrinsicLowering::expandMemIntrinsicUses(
if (isa<ConstantInt>(Memset->getLength()))
break;
- expandMemSetAsLoop(Memset);
+ Function *ParentFunc = Memset->getFunction();
+ const TargetTransformInfo &TTI = LookupTTI(*ParentFunc);
+ expandMemSetAsLoop(Memset, TTI);
Changed = true;
Memset->eraseFromParent();
break;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index fdff21b6ef8df..76f1e006bbf74 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -635,7 +635,8 @@ bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitMemSetInst(
MemSetInst &MSI) {
if (MSI.getDestAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER)
return false;
- llvm::expandMemSetAsLoop(&MSI);
+ llvm::expandMemSetAsLoop(&MSI,
+ TM->getTargetTransformInfo(*MSI.getFunction()));
MSI.eraseFromParent();
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 03d16fdd54c42..5a68dca1b10b7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -80,7 +80,8 @@ static cl::opt<size_t> InlineMaxBB(
static cl::opt<unsigned> MemcpyLoopUnroll(
"amdgpu-memcpy-loop-unroll",
cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory "
- "operations when lowering memcpy as a loop"),
+               "operations when lowering statically-sized memcpy, memmove, or "
+ "memset as a loop"),
cl::init(16), cl::Hidden);
static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
index ac6f4061b9f1f..d0b50d2610bd5 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
@@ -128,7 +128,7 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) {
} else if (MemMoveInst *Memmove = dyn_cast<MemMoveInst>(MemCall)) {
expandMemMoveAsLoop(Memmove, TTI);
} else if (MemSetInst *Memset = dyn_cast<MemSetInst>(MemCall)) {
- expandMemSetAsLoop(Memset);
+ expandMemSetAsLoop(Memset, TTI);
}
MemCall->eraseFromParent();
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
index be88f334d2171..8bee6da75cc75 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
@@ -23,6 +23,7 @@
#include "SPIRVTargetMachine.h"
#include "SPIRVUtils.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/IR/IRBuilder.h"
@@ -93,7 +94,8 @@ static Function *getOrCreateFunction(Module *M, Type *RetTy,
return NewF;
}
-static bool lowerIntrinsicToFunction(IntrinsicInst *Intrinsic) {
+static bool lowerIntrinsicToFunction(IntrinsicInst *Intrinsic,
+ const TargetTransformInfo &TTI) {
// For @llvm.memset.* intrinsic cases with constant value and length arguments
// are emulated via "storing" a constant array to the destination. For other
// cases we wrap the intrinsic in @spirv.llvm_memset_* function and expand the
@@ -137,7 +139,7 @@ static bool lowerIntrinsicToFunction(IntrinsicInst *Intrinsic) {
auto *MemSet = IRB.CreateMemSet(Dest, Val, Len, MSI->getDestAlign(),
MSI->isVolatile());
IRB.CreateRetVoid();
- expandMemSetAsLoop(cast<MemSetInst>(MemSet));
+ expandMemSetAsLoop(cast<MemSetInst>(MemSet), TTI);
MemSet->eraseFromParent();
break;
}
@@ -399,6 +401,7 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) {
bool Changed = false;
const SPIRVSubtarget &STI = TM.getSubtarget<SPIRVSubtarget>(*F);
SmallVector<Instruction *> EraseFromParent;
+ const TargetTransformInfo &TTI = TM.getTargetTransformInfo(*F);
for (BasicBlock &BB : *F) {
for (Instruction &I : make_early_inc_range(BB)) {
auto Call = dyn_cast<CallInst>(&I);
@@ -411,7 +414,7 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) {
switch (II->getIntrinsicID()) {
case Intrinsic::memset:
case Intrinsic::bswap:
- Changed |= lowerIntrinsicToFunction(II);
+ Changed |= lowerIntrinsicToFunction(II, TTI);
break;
case Intrinsic::fshl:
case Intrinsic::fshr:
@@ -459,7 +462,7 @@ bool SPIRVPrepareFunctions::substituteIntrinsicCalls(Function *F) {
return false;
return II->getCalledFunction()->getName().starts_with(Prefix);
}))
- Changed |= lowerIntrinsicToFunction(II);
+ Changed |= lowerIntrinsicToFunction(II, TTI);
break;
}
}
diff --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
index 88e5d038bff82..07bfceb99d206 100644
--- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -930,9 +930,187 @@ static void createMemMoveLoopKnownSize(Instruction *InsertBefore,
}
}
+/// Create a Value of \p DstType that consists of a sequence of copies of
+/// \p SetValue, using bitcasts and a vector splat.
+static Value *createMemSetSplat(const DataLayout &DL, IRBuilderBase &B,
+ Value *SetValue, Type *DstType) {
+ unsigned DstSize = DL.getTypeStoreSize(DstType);
+ Type *SetValueType = SetValue->getType();
+ unsigned SetValueSize = DL.getTypeStoreSize(SetValueType);
+ assert(SetValueSize == DL.getTypeAllocSize(SetValueType) &&
+ "Store size and alloc size of SetValue's type must match");
+ assert(SetValueSize != 0 && DstSize % SetValueSize == 0 &&
+ "DstType size must be a multiple of SetValue size");
+
+ Value *Result = SetValue;
+ if (DstSize != SetValueSize) {
+ if (!SetValueType->isIntegerTy() && !SetValueType->isFloatingPointTy()) {
+ // If the type cannot be put into a vector, bitcast to iN first.
+ LLVMContext &Ctx = SetValue->getContext();
+ Result = B.CreateBitCast(Result, Type::getIntNTy(Ctx, SetValueSize * 8),
+ "setvalue.toint");
+ }
+ // Form a sufficiently large vector consisting of SetValue, repeated.
+ Result =
+ B.CreateVectorSplat(DstSize / SetValueSize, Result, "setvalue.splat");
+ }
+
+ // The value has the right size, but we might have to bitcast it to the right
+ // type.
+ if (Result->getType() != DstType) {
+ Result = B.CreateBitCast(Result, DstType, "setvalue.splat.cast");
+ }
+ return Result;
+}
+
+static void createMemSetLoopKnownSize(Instruction *InsertBefore, Value *DstAddr,
+ ConstantInt *Len, Value *SetValue,
+ Align DstAlign, bool IsVolatile,
+ const TargetTransformInfo &TTI) {
+ // No need to expand zero length memsets.
+ if (Len->isZero())
+ return;
+
+ BasicBlock *PreLoopBB = InsertBefore->getParent();
+ Function *ParentFunc = PreLoopBB->getParent();
+ const DataLayout &DL = ParentFunc->getDataLayout();
+ LLVMContext &Ctx = PreLoopBB->getContext();
+
+ unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
+
+ Type *TypeOfLen = Len->getType();
+ Type *Int8Type = Type::getInt8Ty(Ctx);
+ assert(SetValue->getType() == Int8Type && "Can only set bytes");
+
+ // Use the same memory access type as for a memcpy with the same Dst and Src
+ // alignment and address space.
+ Type *LoopOpType = TTI.getMemcpyLoopLoweringType(
+ Ctx, Len, DstAS, DstAS, DstAlign, DstAlign, std::nullopt);
+ unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
+
+ uint64_t LoopEndCount = alignDown(Len->getZExtValue(), LoopOpSize);
+
+ if (LoopEndCount != 0) {
+ Value *SplatSetValue = nullptr;
+ {
+ IRBuilder<> PreLoopBuilder(InsertBefore);
+ SplatSetValue =
+ createMemSetSplat(DL, PreLoopBuilder, SetValue, LoopOpType);
+ }
+
+ // Don't generate a residual loop, the remaining bytes are set with
+ // straight-line code.
+ LoopExpansionInfo LEI =
+ insertLoopExpansion(InsertBefore, Len, LoopOpSize, 0, "static-memset");
+
+ // Fill MainLoopBB
+ IRBuilder<> MainLoopBuilder(LEI.MainLoopIP);
+ Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));
+
+ Value *DstGEP =
+ MainLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, LEI.MainLoopIndex);
+
+ MainLoopBuilder.CreateAlignedStore(SplatSetValue, DstGEP, PartDstAlign,
+ IsVolatile);
+
+ assert(!LEI.ResidualLoopIP && !LEI.ResidualLoopIndex &&
+ "No residual loop was requested");
+ }
+
+ uint64_t BytesSet = LoopEndCount;
+ uint64_t RemainingBytes = Len->getZExtValue() - BytesSet;
+ if (RemainingBytes == 0)
+ return;
+
+ IRBuilder<> RBuilder(InsertBefore);
+
+ SmallVector<Type *, 5> RemainingOps;
+ TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
+ DstAS, DstAS, DstAlign, DstAlign,
+ std::nullopt);
+
+ Type *PreviousOpTy = nullptr;
+ Value *SplatSetValue = nullptr;
+ for (auto *OpTy : RemainingOps) {
+ unsigned OperandSize = DL.getTypeStoreSize(OpTy);
+ Align PartDstAlign(commonAlignment(DstAlign, BytesSet));
+
+ // Avoid recomputing the splat SetValue if it's the same as for the last
+ // iteration.
+ if (OpTy != PreviousOpTy)
+ SplatSetValue = createMemSetSplat(DL, RBuilder, SetValue, OpTy);
+
+ Value *DstGEP = RBuilder.CreateInBoundsGEP(
+ Int8Type, DstAddr, ConstantInt::get(TypeOfLen, BytesSet));
+ RBuilder.CreateAlignedStore(SplatSetValue, DstGEP, PartDstAlign,
+ IsVolatile);
+ BytesSet += OperandSize;
+ PreviousOpTy = OpTy;
+ }
+ assert(BytesSet == Len->getZExtValue() &&
+ "Bytes set should match size in the call!");
+}
+
+static void createMemSetLoopUnknownSize(Instruction *InsertBefore,
+ Value *DstAddr, Value *Len,
+ Value *SetValue, Align DstAlign,
+ bool IsVolatile,
+ const TargetTransformInfo &TTI) {
+ BasicBlock *PreLoopBB = InsertBefore->getParent();
+ Function *ParentFunc = PreLoopBB->getParent();
+ const DataLayout &DL = ParentFunc->getDataLayout();
+ LLVMContext &Ctx = PreLoopBB->getContext();
+
+ unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
+
+ Type *Int8Type = Type::getInt8Ty(Ctx);
+ assert(SetValue->getType() == Int8Type && "Can only set bytes");
+
+ Type *LoopOpType = TTI.getMemcpyLoopLoweringType(
+ Ctx, Len, DstAS, DstAS, DstAlign, DstAlign, std::nullopt);
+ unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
+
+ Type *ResidualLoopOpType = Int8Type;
+ unsigned ResidualLoopOpSize = DL.getTypeStoreSize(ResidualLoopOpType);
+
+ Value *SplatSetValue = SetValue;
+ {
+ IRBuilder<> PreLoopBuilder(InsertBefore);
+ SplatSetValue = createMemSetSplat(DL, PreLoopBuilder, SetValue, LoopOpType);
+ }
+
+ LoopExpansionInfo LEI = insertLoopExpansion(
+ InsertBefore, Len, LoopOpSize, ResidualLoopOpSize, "dynamic-memset");
+
+ // Fill MainLoopBB
+ IRBuilder<> MainLoopBuilder(LEI.MainLoopIP);
+ Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));
+
+ Value *DstGEP =
+ MainLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr, LEI.MainLoopIndex);
+ MainLoopBuilder.CreateAlignedStore(SplatSetValue, DstGEP, PartDstAlign,
+ IsVolatile);
+
+ // Fill ResidualLoopBB
+ if (!LEI.ResidualLoopIP)
+ return;
+
+ Align ResDstAlign(commonAlignment(PartDstAlign, ResidualLoopOpSize));
+
+ IRBuilder<> ResLoopBuilder(LEI.ResidualLoopIP);
+
+ Value *ResDstGEP = ResLoopBuilder.CreateInBoundsGEP(Int8Type, DstAddr,
+ LEI.ResidualLoopIndex);
+ ResLoopBuilder.CreateAlignedStore(SetValue, ResDstGEP, ResDstAlign,
+ IsVolatile);
+}
+
static void createMemSetLoop(Instruction *InsertBefore, Value *DstAddr,
Value *CopyLen, Value *SetValue, Align DstAlign,
bool IsVolatile) {
+  // No longer used for memset; only used for memset.pattern.
+ // TODO: Update the memset.pattern lowering to also use the loop expansion
+ // framework and remove this function.
Type *TypeOfCopyLen = CopyLen->getType();
BasicBlock *OrigBB = InsertBefore->getParent();
Function *F = OrigBB->getParent();
@@ -1067,13 +1245,25 @@ bool llvm::expandMemMoveAsLoop(MemMoveInst *Memmove,
return true;
}
-void llvm::expandMemSetAsLoop(MemSetInst *Memset) {
- createMemSetLoop(/* InsertBefore */ Memset,
- /* DstAddr */ Memset->getRawDest(),
- /* CopyLen */ Memset->getLength(),
- /* SetValue */ Memset->getValue(),
- /* Alignment */ Memset->getDestAlign().valueOrOne(),
- Memset->isVolatile());
+void llvm::expandMemSetAsLoop(MemSetInst *Memset,
+ const TargetTransformInfo &TTI) {
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Memset->getLength())) {
+ createMemSetLoopKnownSize(
+ /* InsertBefore */ Memset,
+ /* DstAddr */ Memset->getRawDest(),
+ /* Len */ CI,
+ /* SetValue */ Memset->getValue(),
+ /* DstAlign */ Memset->getDestAlign().valueOrOne(),
+ Memset->isVolatile(), TTI);
+ } else {
+ createMemSetLoopUnknownSize(
+ /* InsertBefore */ Memset,
+ /* DstAddr */ Memset->getRawDest(),
+ /* Len */ Memset->getLength(),
+ /* SetValue */ Memset->getValue(),
+ /* DstAlign */ Memset->getDestAlign().valueOrOne(),
+ Memset->isVolatile(), TTI);
+ }
}
void llvm::expandMemSetPatternAsLoop(MemSetPatternInst *Memset) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll
index 04652af147f9b..4d35f3198bc0a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll
@@ -1,27 +1,87 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-- -mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=LOOP %s
-; RUN: llc -global-isel -mtriple=amdgcn-- -mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=UNROLL %s
+; RUN: llc -global-isel -mtriple=amdgcn-- -amdgpu-memcpy-loop-unroll=2 -mem-intrinsic-expand-size=35 %s -o - | FileCheck -check-prefix=LOOP %s
+; RUN: llc -global-isel -mtriple=amdgcn-- -amdgpu-memcpy-loop-unroll=2 -mem-intrinsic-expand-size=37 %s -o - | FileCheck -check-prefix=UNROLL %s
declare void @llvm.memset.p1.i32(ptr addrspace(1), i8, i32, i1)
define amdgpu_cs void @memset_p1i8(ptr addrspace(1) %dst, i8 %val) {
; LOOP-LABEL: memset_p1i8:
-; LOOP: ; %bb.0: ; %loadstoreloop.preheader
+; LOOP: ; %bb.0:
+; LOOP-NEXT: v_and_b32_e32 v3, 0xff, v2
; LOOP-NEXT: s_mov_b64 s[0:1], 0
; LOOP-NEXT: s_mov_b32 s2, 0
; LOOP-NEXT: s_mov_b32 s3, 0xf000
+; LOOP-NEXT: v_lshlrev_b32_e32 v4, 8, v3
+; LOOP-NEXT: v_lshlrev_b32_e32 v5, 16, v3
+; LOOP-NEXT: v_lshlrev_b32_e32 v6, 24, v3
+; LOOP-NEXT: v_or_b32_e32 v3, v3, v4
+; LOOP-NEXT: v_or_b32_e32 v3, v3, v5
+; LOOP-NEXT: v_or_b32_e32 v5, v3, v6
+; LOOP-NEXT: v_lshrrev_b32_e32 v6, 16, v5
+; LOOP-NEXT: v_bfe_u32 v7, v5, 8, 8
+; LOOP-NEXT: v_lshrrev_b32_e32 v8, 24, v5
; LOOP-NEXT: v_mov_b32_e32 v4, s1
; LOOP-NEXT: v_mov_b32_e32 v3, s0
-; LOOP-NEXT: .LBB0_1: ; %loadstoreloop
+; LOOP-NEXT: .LBB0_1: ; %static-memset-expansion-main-body
; LOOP-NEXT: ; =>This Inner Loop Header: Depth=1
-; LOOP-NEXT: v_add_i32_e32 v5, vcc, v0, v3
-; LOOP-NEXT: v_addc_u32_e32 v6, vcc, v1, v4, vcc
-; LOOP-NEXT: v_add_i32_e32 v3, vcc, 1, v3
+; LOOP-NEXT: v_add_i32_e32 v9, vcc, v0, v3
+; LOOP-NEXT: v_addc_u32_e32 v10, vcc, v1, v4, vcc
+; LOOP-NEXT: v_add_i32_e32 v3, vcc, 32, v3
; LOOP-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
-; LOOP-NEXT: v_cmp_gt_u32_e32 vcc, 4, v3
-; LOOP-NEXT: buffer_store_byte v2, v[5:6], s[0:3], 0 addr64
+; LOOP-NEXT: buffer_store_byte v5, v[9:10], s[0:3], 0 addr64
+; LOOP-NEXT: buffer_store_byte v7, v[9:10], s[0:3], 0 addr64 offset:1
+; LOOP-NEXT: buffer_store_byte v6, v[9:10], s[0:3], 0 addr64 offset:2
+; LOOP-NEXT: buffer_store_byte v8, v[9:10], s[0:3], 0 addr64 offset:3
+; LOOP-NEXT: buffer_store_byte v5, v[9:10], s[0:3], 0 addr64 offset:4
+; LOOP-NEXT: buffer_store_byte v7, v[9:10], s[0:3], 0 addr64 offset:5
+; LOOP-NEXT: buffer_store_byte v6, v[9:10], s[0:3], 0 addr64 offset:6
+; LOOP-NEXT: buffer_store_byte v8, v[9:10], s[0:3], 0 addr64 offset:7
+; LOOP-NEXT: buffer_store_byte v5, v[9:10], s[0:3], 0 addr64 offset:8
+; LOOP-NEXT: buffer_store_byte v7, v[9:10], s[0:3], 0 addr64 offset:9
+; LOOP-NEXT: buffer_store_byte v6, v[9:10], s[0:3], 0 addr64 offset:10
+; LOOP-NEXT: buffer_store_byte v8, v[9:10], s[0:3], 0 addr64 offset:11
+; LOOP-NEXT: buffer_store_byte v5, v[9:10], s[0:3], 0 addr64 offset:12
+; LOOP-NEXT: buffer_store_byte v7, v[9:10], s[0:3], 0 addr64 offset:13
+; LOOP-NEXT: buffer_store_byte v6, v[9:10], s[0:3], 0 addr64 offset:14
+; LOOP-NEXT: buffer_store_byte v8, v[9:10], s[0:3], 0 addr64 offset:15
+; LOOP-NEXT: buffer_store_byte v5, v[9:10], s[0:3], 0 addr64 offset:16
+; LOOP-NEXT: buffer_store_byte v7, v[9:10], s[0:3], 0 addr64 offset:17
+; LOOP-NEXT: buffer_store_byte v6, v[9:10], s[0:3], 0 addr64 offset:18
+; LOOP-NEXT: buffer_store_byte v8, v[9:10], s[0:3], 0 addr64 offset:19
+; LOOP-NEXT: buffer_store_byte v5, v[9:10], s[0:3], 0 addr64 offset:20
+; LOOP-NEXT: buffer_store_byte v7, v[9:10], s[0:3], 0 addr64 offset:21
+; LOOP-NEXT: buffer_store_byte v6, v[9:10], s[0:3], 0 addr64 offset:22
+; LOOP-NEXT: buffer_store_byte v8, v[9:10], s[0:3], 0 addr64 offset:23
+; LOOP-NEXT: buffer_store_byte v5, v[9:10], s[0:3], 0 addr64 offset:24
+; LOOP-NEXT: buffer_store_byte v7, v[9:10], s[0:3], 0 addr64 offset:25
+; LOOP-NEXT: buffer_store_byte v6, v[9:10], s[0:3], 0 addr64 offset:26
+; LOOP-NEXT: buffer_store_byte v8, v[9:10], s[0:3], 0 addr64 offset:27
+; LOOP-NEXT: buffer_store_byte v5, v[9:10], s[0:3], 0 addr64 offset:28
+; LOOP-NEXT: buffer_store_byte v7, v[9:10], s[0:3], 0 addr64 offset:29
+; LOOP-NEXT: buffer_store_byte v6, v[9:10], s[0:3], 0 addr64 offset:30
+; LOOP-NEXT: v_cmp_gt_u32_e32 vcc, 32, v3
+; LOOP-NE...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/169040
More information about the llvm-branch-commits mailing list