[llvm] [LoopIdiom] Select llvm.experimental.memset.pattern intrinsic rather than memset_pattern16 libcall (PR #126736)
Alex Bradbury via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 11 21:16:29 PDT 2025
https://github.com/asb updated https://github.com/llvm/llvm-project/pull/126736
>From a052059d9c2ef5200b8206293deb84b0643f96bf Mon Sep 17 00:00:00 2001
From: Alex Bradbury <asb at igalia.com>
Date: Tue, 11 Feb 2025 09:44:04 +0000
Subject: [PATCH 1/5] [LoopIdiom] Select llvm.experimental.memset.pattern
intrinsic rather than memset_pattern16 libcall
In order to keep the change as incremental as possible, this only
introduces the memset.pattern intrinsic in cases where memset_pattern16
would have been used. Future patches can enable it on targets that don't
have the libcall. As the memset.pattern intrinsic takes the number of
times to store the pattern as an argument, unlike memset_pattern16 which
takes the number of bytes to write, we no longer try to form an i128
pattern.
Special care is taken for cases where multiple stores in the
same loop iteration were combined to form a single pattern. For such
cases, we inherit the existing limitation: loops such as the following,
in which every store writes the same value, are supported:
```
for (unsigned i = 0; i < 2 * n; i += 2) {
f[i] = 2;
f[i+1] = 2;
}
```
But the following doesn't result in a memset.pattern (even though it
could, by forming an appropriate pattern):
```
for (unsigned i = 0; i < 2 * n; i += 2) {
f[i] = 2;
f[i+1] = 3;
}
```
Addressing this existing deficiency is left for a follow-up due to a
desire not to change too much at once (i.e. to target equivalence to the
current codegen).
A command line option is introduced to force the selection of the
intrinsic even in cases where it otherwise wouldn't be selected (i.e. in
cases where the libcall wouldn't have been selected). This is intended as
a transitional option for testing and experimentation, to be removed at a
later point.
---
.../Transforms/Scalar/LoopIdiomRecognize.cpp | 136 ++++++++++++------
.../LoopIdiom/RISCV/memset-pattern.ll | 49 +++++++
llvm/test/Transforms/LoopIdiom/basic.ll | 11 +-
.../LoopIdiom/memset-pattern-tbaa.ll | 16 +--
.../Transforms/LoopIdiom/struct_pattern.ll | 18 +--
.../Transforms/LoopIdiom/unroll-custom-dl.ll | 10 +-
llvm/test/Transforms/LoopIdiom/unroll.ll | 10 +-
7 files changed, 166 insertions(+), 84 deletions(-)
create mode 100644 llvm/test/Transforms/LoopIdiom/RISCV/memset-pattern.ll
diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 2462ec33e0c20..58c12298ca926 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -132,6 +132,11 @@ static cl::opt<bool> UseLIRCodeSizeHeurs(
"with -Os/-Oz"),
cl::init(true), cl::Hidden);
+static cl::opt<bool> ForceMemsetPatternIntrinsic(
+ "loop-idiom-force-memset-pattern-intrinsic",
+ cl::desc("Enable use of the memset.pattern intrinsic"), cl::init(false),
+ cl::Hidden);
+
namespace {
class LoopIdiomRecognize {
@@ -303,10 +308,15 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L) {
L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs;
HasMemset = TLI->has(LibFunc_memset);
+ // TODO: Unconditionally enable use of the memset pattern intrinsic (or at
+ // least, opt-in via target hook) once we are confident it will never result
+ // in worse codegen than without. For now, use it only when we would have
+ // previously emitted a libcall to memset_pattern16 (or unless this is
+ // overridden by command line option).
HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
HasMemcpy = TLI->has(LibFunc_memcpy);
- if (HasMemset || HasMemsetPattern || HasMemcpy)
+ if (HasMemset || HasMemsetPattern || ForceMemsetPatternIntrinsic || HasMemcpy)
if (SE->hasLoopInvariantBackedgeTakenCount(L))
return runOnCountableLoop();
@@ -392,14 +402,7 @@ static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
if (Size > 16)
return nullptr;
- // If the constant is exactly 16 bytes, just use it.
- if (Size == 16)
- return C;
-
- // Otherwise, we'll use an array of the constants.
- unsigned ArraySize = 16 / Size;
- ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
- return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
+ return C;
}
LoopIdiomRecognize::LegalStoreKind
@@ -463,8 +466,9 @@ LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
// It looks like we can use SplatValue.
return LegalStoreKind::Memset;
}
- if (!UnorderedAtomic && HasMemsetPattern && !DisableLIRP::Memset &&
- // Don't create memset_pattern16s with address spaces.
+ if (!UnorderedAtomic && (HasMemsetPattern || ForceMemsetPatternIntrinsic) &&
+ !DisableLIRP::Memset &&
+ // Don't create memset.pattern intrinsic calls with address spaces.
StorePtr->getType()->getPointerAddressSpace() == 0 &&
getMemSetPatternValue(StoredVal, DL)) {
// It looks like we can use PatternValue!
@@ -1064,53 +1068,101 @@ bool LoopIdiomRecognize::processLoopStridedStore(
return Changed;
// Okay, everything looks good, insert the memset.
+ // MemsetArg is the number of bytes for the memset libcall, and the number
+ // of pattern repetitions if the memset.pattern intrinsic is being used.
+ Value *MemsetArg;
+ std::optional<int64_t> BytesWritten = std::nullopt;
+
+ if (PatternValue && (HasMemsetPattern || ForceMemsetPatternIntrinsic)) {
+ const SCEV *TripCountS =
+ SE->getTripCountFromExitCount(BECount, IntIdxTy, CurLoop);
+ if (!Expander.isSafeToExpand(TripCountS))
+ return Changed;
+ const SCEVConstant *ConstStoreSize = dyn_cast<SCEVConstant>(StoreSizeSCEV);
+ if (!ConstStoreSize)
+ return Changed;
+ Value *TripCount = Expander.expandCodeFor(TripCountS, IntIdxTy,
+ Preheader->getTerminator());
+ uint64_t PatternRepsPerTrip =
+ (ConstStoreSize->getValue()->getZExtValue() * 8) /
+ DL->getTypeSizeInBits(PatternValue->getType());
+ // If ConstStoreSize is not equal to the width of PatternValue, then
+ // MemsetArg is TripCount * (ConstStoreSize/PatternValueWidth). Else
+ // MemSetArg is just TripCount.
+ MemsetArg =
+ PatternRepsPerTrip == 1
+ ? TripCount
+ : Builder.CreateMul(TripCount,
+ Builder.getIntN(IntIdxTy->getIntegerBitWidth(),
+ PatternRepsPerTrip));
+ if (auto CI = dyn_cast<ConstantInt>(TripCount))
+ BytesWritten =
+ CI->getZExtValue() * ConstStoreSize->getValue()->getZExtValue();
+ } else {
+ const SCEV *NumBytesS =
+ getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);
- const SCEV *NumBytesS =
- getNumBytes(BECount, IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);
-
- // TODO: ideally we should still be able to generate memset if SCEV expander
- // is taught to generate the dependencies at the latest point.
- if (!Expander.isSafeToExpand(NumBytesS))
- return Changed;
-
- Value *NumBytes =
- Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
+ // TODO: ideally we should still be able to generate memset if SCEV expander
+ // is taught to generate the dependencies at the latest point.
+ if (!Expander.isSafeToExpand(NumBytesS))
+ return Changed;
+ MemsetArg =
+ Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
+ if (auto CI = dyn_cast<ConstantInt>(MemsetArg))
+ BytesWritten = CI->getZExtValue();
+ }
+ assert(MemsetArg && "MemsetArg should have been set");
- if (!SplatValue && !isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16))
+ if (!SplatValue && !(ForceMemsetPatternIntrinsic ||
+ isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16)))
return Changed;
AAMDNodes AATags = TheStore->getAAMetadata();
for (Instruction *Store : Stores)
AATags = AATags.merge(Store->getAAMetadata());
- if (auto CI = dyn_cast<ConstantInt>(NumBytes))
- AATags = AATags.extendTo(CI->getZExtValue());
+ if (BytesWritten)
+ AATags = AATags.extendTo(BytesWritten.value());
else
AATags = AATags.extendTo(-1);
CallInst *NewCall;
if (SplatValue) {
NewCall = Builder.CreateMemSet(
- BasePtr, SplatValue, NumBytes, MaybeAlign(StoreAlignment),
+ BasePtr, SplatValue, MemsetArg, MaybeAlign(StoreAlignment),
/*isVolatile=*/false, AATags.TBAA, AATags.Scope, AATags.NoAlias);
} else {
- assert (isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16));
+ assert(ForceMemsetPatternIntrinsic ||
+ isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16));
// Everything is emitted in default address space
- Type *Int8PtrTy = DestInt8PtrTy;
-
- StringRef FuncName = "memset_pattern16";
- FunctionCallee MSP = getOrInsertLibFunc(M, *TLI, LibFunc_memset_pattern16,
- Builder.getVoidTy(), Int8PtrTy, Int8PtrTy, IntIdxTy);
- inferNonMandatoryLibFuncAttrs(M, FuncName, *TLI);
-
- // Otherwise we should form a memset_pattern16. PatternValue is known to be
- // an constant array of 16-bytes. Plop the value into a mergable global.
- GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
- GlobalValue::PrivateLinkage,
- PatternValue, ".memset_pattern");
- GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these.
- GV->setAlignment(Align(16));
- Value *PatternPtr = GV;
- NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
+
+ assert(isa<SCEVConstant>(StoreSizeSCEV) && "Expected constant store size");
+
+ Value *PatternArg;
+ IntegerType *PatternArgTy =
+ Builder.getIntNTy(DL->getTypeSizeInBits(PatternValue->getType()));
+
+ // If the pattern value can be casted directly to an integer argument, use
+ // that. Otherwise (e.g. if the value is a global pointer), create a
+ // GlobalVariable and load from it.
+ if (isa<ConstantInt>(PatternValue)) {
+ PatternArg = PatternValue;
+ } else if (isa<ConstantFP>(PatternValue)) {
+ PatternArg = Builder.CreateBitCast(PatternValue, PatternArgTy);
+ } else {
+ GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
+ GlobalValue::PrivateLinkage,
+ PatternValue, ".memset_pattern");
+ GV->setUnnamedAddr(
+ GlobalValue::UnnamedAddr::Global); // Ok to merge these.
+ GV->setAlignment(Align(PatternArgTy->getPrimitiveSizeInBits()));
+ PatternArg = Builder.CreateLoad(PatternArgTy, GV);
+ }
+ assert(PatternArg);
+
+ NewCall = Builder.CreateIntrinsic(Intrinsic::experimental_memset_pattern,
+ {DestInt8PtrTy, PatternArgTy, IntIdxTy},
+ {BasePtr, PatternArg, MemsetArg,
+ ConstantInt::getFalse(M->getContext())});
// Set the TBAA info if present.
if (AATags.TBAA)
diff --git a/llvm/test/Transforms/LoopIdiom/RISCV/memset-pattern.ll b/llvm/test/Transforms/LoopIdiom/RISCV/memset-pattern.ll
new file mode 100644
index 0000000000000..b3cee756076af
--- /dev/null
+++ b/llvm/test/Transforms/LoopIdiom/RISCV/memset-pattern.ll
@@ -0,0 +1,49 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
+; RUN: opt -passes=loop-idiom -mtriple=riscv64 < %s -S | FileCheck %s
+; RUN: opt -passes=loop-idiom -mtriple=riscv64 -loop-idiom-force-memset-pattern-intrinsic < %s -S \
+; RUN: | FileCheck -check-prefix=CHECK-INTRIN %s
+
+define dso_local void @double_memset(ptr nocapture %p) {
+; CHECK-LABEL: @double_memset(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+; CHECK: for.body:
+; CHECK-NEXT: [[I_07:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds double, ptr [[P:%.*]], i64 [[I_07]]
+; CHECK-NEXT: store double 3.141590e+00, ptr [[PTR1]], align 1
+; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_07]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 16
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+;
+; CHECK-INTRIN-LABEL: @double_memset(
+; CHECK-INTRIN-NEXT: entry:
+; CHECK-INTRIN-NEXT: call void @llvm.experimental.memset.pattern.p0.i64.i64(ptr [[P:%.*]], i64 4614256650576692846, i64 16, i1 false)
+; CHECK-INTRIN-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-INTRIN: for.cond.cleanup:
+; CHECK-INTRIN-NEXT: ret void
+; CHECK-INTRIN: for.body:
+; CHECK-INTRIN-NEXT: [[I_07:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-INTRIN-NEXT: [[PTR1:%.*]] = getelementptr inbounds double, ptr [[P]], i64 [[I_07]]
+; CHECK-INTRIN-NEXT: [[INC]] = add nuw nsw i64 [[I_07]], 1
+; CHECK-INTRIN-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 16
+; CHECK-INTRIN-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+;
+entry:
+ br label %for.body
+
+for.cond.cleanup:
+ ret void
+
+for.body:
+ %i.07 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+ %ptr1 = getelementptr inbounds double, ptr %p, i64 %i.07
+ store double 3.14159e+00, ptr %ptr1, align 1
+ %inc = add nuw nsw i64 %i.07, 1
+ %exitcond.not = icmp eq i64 %inc, 16
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+;.
+; CHECK-INTRIN: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) }
+;.
diff --git a/llvm/test/Transforms/LoopIdiom/basic.ll b/llvm/test/Transforms/LoopIdiom/basic.ll
index e6fc11625317b..0fe8cd747408f 100644
--- a/llvm/test/Transforms/LoopIdiom/basic.ll
+++ b/llvm/test/Transforms/LoopIdiom/basic.ll
@@ -7,8 +7,7 @@ target triple = "x86_64-apple-darwin10.0.0"
;.
; CHECK: @G = global i32 5
; CHECK: @g_50 = global [7 x i32] [i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0], align 16
-; CHECK: @.memset_pattern = private unnamed_addr constant [4 x i32] [i32 1, i32 1, i32 1, i32 1], align 16
-; CHECK: @.memset_pattern.1 = private unnamed_addr constant [2 x ptr] [ptr @G, ptr @G], align 16
+; CHECK: @.memset_pattern = private unnamed_addr constant ptr @G, align 64
;.
define void @test1(ptr %Base, i64 %Size) nounwind ssp {
; CHECK-LABEL: @test1(
@@ -533,7 +532,7 @@ for.end13: ; preds = %for.inc10
define void @test11_pattern(ptr nocapture %P) nounwind ssp {
; CHECK-LABEL: @test11_pattern(
; CHECK-NEXT: entry:
-; CHECK-NEXT: call void @memset_pattern16(ptr [[P:%.*]], ptr @.memset_pattern, i64 40000)
+; CHECK-NEXT: call void @llvm.experimental.memset.pattern.p0.i32.i64(ptr [[P:%.*]], i32 1, i64 10000, i1 false)
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ]
@@ -596,7 +595,8 @@ for.end: ; preds = %for.body
define void @test13_pattern(ptr nocapture %P) nounwind ssp {
; CHECK-LABEL: @test13_pattern(
; CHECK-NEXT: entry:
-; CHECK-NEXT: call void @memset_pattern16(ptr [[P:%.*]], ptr @.memset_pattern.1, i64 80000)
+; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @.memset_pattern, align 8
+; CHECK-NEXT: call void @llvm.experimental.memset.pattern.p0.i64.i64(ptr [[P:%.*]], i64 [[TMP0]], i64 10000, i1 false)
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ]
@@ -1625,6 +1625,5 @@ define noalias ptr @_ZN8CMSPULog9beginImplEja(ptr nocapture writeonly %0) local_
; CHECK: attributes #[[ATTR1:[0-9]+]] = { nounwind }
; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) }
-; CHECK: attributes #[[ATTR4:[0-9]+]] = { nofree nounwind willreturn memory(argmem: readwrite) }
-; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
;.
diff --git a/llvm/test/Transforms/LoopIdiom/memset-pattern-tbaa.ll b/llvm/test/Transforms/LoopIdiom/memset-pattern-tbaa.ll
index 57a91a3bf6e2c..98521ef82fbe7 100644
--- a/llvm/test/Transforms/LoopIdiom/memset-pattern-tbaa.ll
+++ b/llvm/test/Transforms/LoopIdiom/memset-pattern-tbaa.ll
@@ -6,15 +6,10 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
target triple = "x86_64-apple-darwin10.0.0"
-;.
-; CHECK: @.memset_pattern = private unnamed_addr constant [2 x double] [double 3.141590e+00, double 3.141590e+00], align 16
-; CHECK: @.memset_pattern.1 = private unnamed_addr constant [2 x double] [double 3.141590e+00, double 3.141590e+00], align 16
-; CHECK: @.memset_pattern.2 = private unnamed_addr constant [2 x double] [double 3.141590e+00, double 3.141590e+00], align 16
-;.
define dso_local void @double_memset(ptr nocapture %p) {
; CHECK-LABEL: @double_memset(
; CHECK-NEXT: entry:
-; CHECK-NEXT: call void @memset_pattern16(ptr [[P:%.*]], ptr @.memset_pattern, i64 128), !tbaa [[TBAA0:![0-9]+]]
+; CHECK-NEXT: call void @llvm.experimental.memset.pattern.p0.i64.i64(ptr [[P:%.*]], i64 4614256650576692846, i64 16, i1 false), !tbaa [[TBAA0:![0-9]+]]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
@@ -44,7 +39,7 @@ for.body:
define dso_local void @struct_memset(ptr nocapture %p) {
; CHECK-LABEL: @struct_memset(
; CHECK-NEXT: entry:
-; CHECK-NEXT: call void @memset_pattern16(ptr [[P:%.*]], ptr @.memset_pattern.1, i64 128), !tbaa [[TBAA4:![0-9]+]]
+; CHECK-NEXT: call void @llvm.experimental.memset.pattern.p0.i64.i64(ptr [[P:%.*]], i64 4614256650576692846, i64 16, i1 false), !tbaa [[TBAA4:![0-9]+]]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
@@ -73,8 +68,7 @@ for.body:
define dso_local void @var_memset(ptr nocapture %p, i64 %len) {
; CHECK-LABEL: @var_memset(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = shl nuw i64 [[LEN:%.*]], 3
-; CHECK-NEXT: call void @memset_pattern16(ptr [[P:%.*]], ptr @.memset_pattern.2, i64 [[TMP0]])
+; CHECK-NEXT: call void @llvm.experimental.memset.pattern.p0.i64.i64(ptr [[P:%.*]], i64 4614256650576692846, i64 [[TMP0:%.*]], i1 false)
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
@@ -82,7 +76,7 @@ define dso_local void @var_memset(ptr nocapture %p, i64 %len) {
; CHECK-NEXT: [[I_07:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds double, ptr [[P]], i64 [[I_07]]
; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_07]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[LEN]]
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[TMP0]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
;
entry:
@@ -116,7 +110,7 @@ for.body:
!21 = !{!22, !20, i64 0}
!22 = !{!"B", !20, i64 0}
;.
-; CHECK: attributes #[[ATTR0:[0-9]+]] = { nofree nounwind willreturn memory(argmem: readwrite) }
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) }
;.
; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0}
; CHECK: [[META1]] = !{!"double", [[META2:![0-9]+]], i64 0}
diff --git a/llvm/test/Transforms/LoopIdiom/struct_pattern.ll b/llvm/test/Transforms/LoopIdiom/struct_pattern.ll
index b65e95353ab3e..f5be8e71cf7bd 100644
--- a/llvm/test/Transforms/LoopIdiom/struct_pattern.ll
+++ b/llvm/test/Transforms/LoopIdiom/struct_pattern.ll
@@ -16,11 +16,6 @@ target triple = "x86_64-apple-darwin10.0.0"
;}
-;.
-; CHECK: @.memset_pattern = private unnamed_addr constant [4 x i32] [i32 2, i32 2, i32 2, i32 2], align 16
-; CHECK: @.memset_pattern.1 = private unnamed_addr constant [4 x i32] [i32 2, i32 2, i32 2, i32 2], align 16
-; CHECK: @.memset_pattern.2 = private unnamed_addr constant [4 x i32] [i32 2, i32 2, i32 2, i32 2], align 16
-;.
define void @bar1(ptr %f, i32 %n) nounwind ssp {
; CHECK-LABEL: @bar1(
; CHECK-NEXT: entry:
@@ -28,8 +23,8 @@ define void @bar1(ptr %f, i32 %n) nounwind ssp {
; CHECK-NEXT: br i1 [[CMP1]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
; CHECK: for.body.preheader:
; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
-; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 3
-; CHECK-NEXT: call void @memset_pattern16(ptr [[F:%.*]], ptr @.memset_pattern, i64 [[TMP1]])
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-NEXT: call void @llvm.experimental.memset.pattern.p0.i32.i64(ptr [[F:%.*]], i32 2, i64 [[TMP1]], i1 false)
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -82,8 +77,8 @@ define void @bar2(ptr %f, i32 %n) nounwind ssp {
; CHECK-NEXT: br i1 [[CMP1]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
; CHECK: for.body.preheader:
; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
-; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 3
-; CHECK-NEXT: call void @memset_pattern16(ptr [[F:%.*]], ptr @.memset_pattern.1, i64 [[TMP1]])
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-NEXT: call void @llvm.experimental.memset.pattern.p0.i32.i64(ptr [[F:%.*]], i32 2, i64 [[TMP1]], i1 false)
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -142,7 +137,8 @@ define void @bar3(ptr nocapture %f, i32 %n) nounwind ssp {
; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 3
; CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[TMP1]], [[TMP4]]
; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[F:%.*]], i64 [[TMP5]]
-; CHECK-NEXT: call void @memset_pattern16(ptr [[UGLYGEP]], ptr @.memset_pattern.2, i64 [[TMP1]])
+; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP0]], 2
+; CHECK-NEXT: call void @llvm.experimental.memset.pattern.p0.i32.i64(ptr [[UGLYGEP]], i32 2, i64 [[TMP7]], i1 false)
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[TMP0]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -292,5 +288,5 @@ for.end: ; preds = %for.end.loopexit, %
}
;.
; CHECK: attributes #[[ATTR0:[0-9]+]] = { nounwind ssp }
-; CHECK: attributes #[[ATTR1:[0-9]+]] = { nofree nounwind willreturn memory(argmem: readwrite) }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) }
;.
diff --git a/llvm/test/Transforms/LoopIdiom/unroll-custom-dl.ll b/llvm/test/Transforms/LoopIdiom/unroll-custom-dl.ll
index a3b421907885b..0a41b68caced8 100644
--- a/llvm/test/Transforms/LoopIdiom/unroll-custom-dl.ll
+++ b/llvm/test/Transforms/LoopIdiom/unroll-custom-dl.ll
@@ -11,9 +11,6 @@ target triple = "x86_64-apple-darwin10.0.0"
; f[i+1] = 0;
; }
;}
-;.
-; CHECK: @.memset_pattern = private unnamed_addr constant [4 x i32] [i32 2, i32 2, i32 2, i32 2], align 16
-;.
define void @test(ptr %f, i32 %n) nounwind ssp {
; CHECK-LABEL: @test(
; CHECK-NEXT: entry:
@@ -81,9 +78,9 @@ define void @test_pattern(ptr %f, i32 %n) nounwind ssp {
; CHECK: for.body.preheader:
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[MUL]], -1
; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 1
-; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[TMP1]], 3
-; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], 8
-; CHECK-NEXT: call void @memset_pattern16(ptr [[F:%.*]], ptr @.memset_pattern, i32 [[TMP3]])
+; CHECK-NEXT: [[TMP2:%.*]] = add nuw i32 [[TMP1]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], 2
+; CHECK-NEXT: call void @llvm.experimental.memset.pattern.p0.i32.i32(ptr [[F:%.*]], i32 2, i32 [[TMP3]], i1 false)
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -126,5 +123,4 @@ for.end: ; preds = %for.end.loopexit, %
;.
; CHECK: attributes #[[ATTR0:[0-9]+]] = { nounwind ssp }
; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) }
-; CHECK: attributes #[[ATTR2:[0-9]+]] = { nofree nounwind willreturn memory(argmem: readwrite) }
;.
diff --git a/llvm/test/Transforms/LoopIdiom/unroll.ll b/llvm/test/Transforms/LoopIdiom/unroll.ll
index c70eeefd6376d..a40477168d374 100644
--- a/llvm/test/Transforms/LoopIdiom/unroll.ll
+++ b/llvm/test/Transforms/LoopIdiom/unroll.ll
@@ -11,9 +11,6 @@ target triple = "x86_64-apple-darwin10.0.0"
; f[i+1] = 0;
; }
;}
-;.
-; CHECK: @.memset_pattern = private unnamed_addr constant [4 x i32] [i32 2, i32 2, i32 2, i32 2], align 16
-;.
define void @test(ptr %f, i32 %n) nounwind ssp {
; CHECK-LABEL: @test(
; CHECK-NEXT: entry:
@@ -84,9 +81,9 @@ define void @test_pattern(ptr %f, i32 %n) nounwind ssp {
; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[MUL]] to i64
; CHECK-NEXT: [[TMP1:%.*]] = add nsw i64 [[TMP0]], -1
; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 1
-; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 3
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP3]], 8
-; CHECK-NEXT: call void @memset_pattern16(ptr [[F:%.*]], ptr @.memset_pattern, i64 [[TMP4]])
+; CHECK-NEXT: [[TMP3:%.*]] = add nuw i64 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2
+; CHECK-NEXT: call void @llvm.experimental.memset.pattern.p0.i32.i64(ptr [[F:%.*]], i32 2, i64 [[TMP4]], i1 false)
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
@@ -130,5 +127,4 @@ for.end: ; preds = %for.end.loopexit, %
;.
; CHECK: attributes #[[ATTR0:[0-9]+]] = { nounwind ssp }
; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) }
-; CHECK: attributes #[[ATTR2:[0-9]+]] = { nofree nounwind willreturn memory(argmem: readwrite) }
;.
>From 809320fdb5370d76ad52e46526517b81804cb8f0 Mon Sep 17 00:00:00 2001
From: Alex Bradbury <asb at igalia.com>
Date: Fri, 28 Feb 2025 13:39:33 +0000
Subject: [PATCH 2/5] Fix bit/bytes confusion in setting alignment
---
llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 2 +-
llvm/test/Transforms/LoopIdiom/basic.ll | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 58c12298ca926..f358cd714046c 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -1154,7 +1154,7 @@ bool LoopIdiomRecognize::processLoopStridedStore(
PatternValue, ".memset_pattern");
GV->setUnnamedAddr(
GlobalValue::UnnamedAddr::Global); // Ok to merge these.
- GV->setAlignment(Align(PatternArgTy->getPrimitiveSizeInBits()));
+ GV->setAlignment(Align(PatternArgTy->getPrimitiveSizeInBits() / 8));
PatternArg = Builder.CreateLoad(PatternArgTy, GV);
}
assert(PatternArg);
diff --git a/llvm/test/Transforms/LoopIdiom/basic.ll b/llvm/test/Transforms/LoopIdiom/basic.ll
index 0fe8cd747408f..c55a10470ae8b 100644
--- a/llvm/test/Transforms/LoopIdiom/basic.ll
+++ b/llvm/test/Transforms/LoopIdiom/basic.ll
@@ -7,7 +7,7 @@ target triple = "x86_64-apple-darwin10.0.0"
;.
; CHECK: @G = global i32 5
; CHECK: @g_50 = global [7 x i32] [i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0], align 16
-; CHECK: @.memset_pattern = private unnamed_addr constant ptr @G, align 64
+; CHECK: @.memset_pattern = private unnamed_addr constant ptr @G, align 8
;.
define void @test1(ptr %Base, i64 %Size) nounwind ssp {
; CHECK-LABEL: @test1(
>From 808e5c25c18ee0e5af95110fc76972f8115b5882 Mon Sep 17 00:00:00 2001
From: Alex Bradbury <asb at igalia.com>
Date: Wed, 5 Mar 2025 13:11:31 +0000
Subject: [PATCH 3/5] Excise unneeded logic for creating a globalvariable
(which was still done for global pointers)
We can just do a ptrtoint.
---
.../Transforms/Scalar/LoopIdiomRecognize.cpp | 23 +++++++++----------
llvm/test/Transforms/LoopIdiom/basic.ll | 4 +---
2 files changed, 12 insertions(+), 15 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index f358cd714046c..373382282fe24 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -402,6 +402,11 @@ static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
if (Size > 16)
return nullptr;
+ // For now, don't handle types that aren't int, floats, or pointers.
+ if (!isa<ConstantInt>(C) && !isa<ConstantFP>(C) &&
+ !isa<PointerType>(C->getType()))
+ return nullptr;
+
return C;
}
@@ -1144,20 +1149,14 @@ bool LoopIdiomRecognize::processLoopStridedStore(
// If the pattern value can be casted directly to an integer argument, use
// that. Otherwise (e.g. if the value is a global pointer), create a
// GlobalVariable and load from it.
- if (isa<ConstantInt>(PatternValue)) {
+ if (isa<ConstantInt>(PatternValue))
PatternArg = PatternValue;
- } else if (isa<ConstantFP>(PatternValue)) {
+ else if (isa<ConstantFP>(PatternValue))
PatternArg = Builder.CreateBitCast(PatternValue, PatternArgTy);
- } else {
- GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
- GlobalValue::PrivateLinkage,
- PatternValue, ".memset_pattern");
- GV->setUnnamedAddr(
- GlobalValue::UnnamedAddr::Global); // Ok to merge these.
- GV->setAlignment(Align(PatternArgTy->getPrimitiveSizeInBits() / 8));
- PatternArg = Builder.CreateLoad(PatternArgTy, GV);
- }
- assert(PatternArg);
+ else if (isa<PointerType>(PatternValue->getType()))
+ PatternArg = Builder.CreatePtrToInt(PatternValue, PatternArgTy);
+ else
+ report_fatal_error("Unexpected PatternValue type");
NewCall = Builder.CreateIntrinsic(Intrinsic::experimental_memset_pattern,
{DestInt8PtrTy, PatternArgTy, IntIdxTy},
diff --git a/llvm/test/Transforms/LoopIdiom/basic.ll b/llvm/test/Transforms/LoopIdiom/basic.ll
index c55a10470ae8b..1d65a2cd27065 100644
--- a/llvm/test/Transforms/LoopIdiom/basic.ll
+++ b/llvm/test/Transforms/LoopIdiom/basic.ll
@@ -7,7 +7,6 @@ target triple = "x86_64-apple-darwin10.0.0"
;.
; CHECK: @G = global i32 5
; CHECK: @g_50 = global [7 x i32] [i32 0, i32 0, i32 0, i32 0, i32 1, i32 0, i32 0], align 16
-; CHECK: @.memset_pattern = private unnamed_addr constant ptr @G, align 8
;.
define void @test1(ptr %Base, i64 %Size) nounwind ssp {
; CHECK-LABEL: @test1(
@@ -595,8 +594,7 @@ for.end: ; preds = %for.body
define void @test13_pattern(ptr nocapture %P) nounwind ssp {
; CHECK-LABEL: @test13_pattern(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @.memset_pattern, align 8
-; CHECK-NEXT: call void @llvm.experimental.memset.pattern.p0.i64.i64(ptr [[P:%.*]], i64 [[TMP0]], i64 10000, i1 false)
+; CHECK-NEXT: call void @llvm.experimental.memset.pattern.p0.i64.i64(ptr [[P:%.*]], i64 ptrtoint (ptr @G to i64), i64 10000, i1 false)
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ]
>From 763b40a337b7a93469cfedcff67cc75806a98e35 Mon Sep 17 00:00:00 2001
From: Alex Bradbury <asb at igalia.com>
Date: Wed, 12 Mar 2025 03:33:14 +0000
Subject: [PATCH 4/5] Remove unnecessary initialisation of std::optional
---
llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 373382282fe24..aa6288439b8c2 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -1076,7 +1076,7 @@ bool LoopIdiomRecognize::processLoopStridedStore(
// MemsetArg is the number of bytes for the memset libcall, and the number
// of pattern repetitions if the memset.pattern intrinsic is being used.
Value *MemsetArg;
- std::optional<int64_t> BytesWritten = std::nullopt;
+ std::optional<int64_t> BytesWritten;
if (PatternValue && (HasMemsetPattern || ForceMemsetPatternIntrinsic)) {
const SCEV *TripCountS =
>From 48a9ed8cceb1699ab205a9c8dbc892f462ba57b7 Mon Sep 17 00:00:00 2001
From: Alex Bradbury <asb at igalia.com>
Date: Wed, 12 Mar 2025 04:15:46 +0000
Subject: [PATCH 5/5] auto *CI and remove comment that's made redundant
---
llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 8 +++-----
1 file changed, 3 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index aa6288439b8c2..85aaee56e9ee8 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -1100,7 +1100,7 @@ bool LoopIdiomRecognize::processLoopStridedStore(
: Builder.CreateMul(TripCount,
Builder.getIntN(IntIdxTy->getIntegerBitWidth(),
PatternRepsPerTrip));
- if (auto CI = dyn_cast<ConstantInt>(TripCount))
+ if (auto *CI = dyn_cast<ConstantInt>(TripCount))
BytesWritten =
CI->getZExtValue() * ConstStoreSize->getValue()->getZExtValue();
} else {
@@ -1113,7 +1113,7 @@ bool LoopIdiomRecognize::processLoopStridedStore(
return Changed;
MemsetArg =
Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
- if (auto CI = dyn_cast<ConstantInt>(MemsetArg))
+ if (auto *CI = dyn_cast<ConstantInt>(MemsetArg))
BytesWritten = CI->getZExtValue();
}
assert(MemsetArg && "MemsetArg should have been set");
@@ -1138,8 +1138,6 @@ bool LoopIdiomRecognize::processLoopStridedStore(
} else {
assert(ForceMemsetPatternIntrinsic ||
isLibFuncEmittable(M, TLI, LibFunc_memset_pattern16));
- // Everything is emitted in default address space
-
assert(isa<SCEVConstant>(StoreSizeSCEV) && "Expected constant store size");
Value *PatternArg;
@@ -1451,7 +1449,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
AAMDNodes AATags = TheLoad->getAAMetadata();
AAMDNodes StoreAATags = TheStore->getAAMetadata();
AATags = AATags.merge(StoreAATags);
- if (auto CI = dyn_cast<ConstantInt>(NumBytes))
+ if (auto *CI = dyn_cast<ConstantInt>(NumBytes))
AATags = AATags.extendTo(CI->getZExtValue());
else
AATags = AATags.extendTo(-1);
More information about the llvm-commits
mailing list