[llvm] Fix/aarch64 memset dup optimization (PR #166030)
Osama Abdelkader via llvm-commits
llvm-commits at lists.llvm.org
Tue Nov 18 10:00:51 PST 2025
https://github.com/osamakader updated https://github.com/llvm/llvm-project/pull/166030
From c9b595dfbdbd5893917677aa756bc9dbd4d5bcdc Mon Sep 17 00:00:00 2001
From: Osama Abdelkader <osama.abdelkader at gmail.com>
Date: Sun, 2 Nov 2025 14:41:16 +0200
Subject: [PATCH 1/2] Optimize AArch64 memset to use NEON DUP instruction for
small sizes
This change improves memset code generation for non-zero values on AArch64
for sizes of 4, 8, and 16 bytes by using NEON's DUP instruction instead of
the less efficient multiplication by the 0x01010101 pattern.
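For reference, the multiply-based splat being replaced can be written in
plain C; a minimal illustration, not part of the patch:

    #include <stdint.h>

    /* Splat one byte across 32 bits: this is what the old lowering
       materializes as mov + and + mul on AArch64. */
    static uint32_t splat32(uint8_t v) { return (uint32_t)v * 0x01010101u; }

    /* 64-bit variant; 0x0101010101010101 is the 72340172838076673
       constant visible in the test diffs below. */
    static uint64_t splat64(uint8_t v) {
        return (uint64_t)v * 0x0101010101010101ull;
    }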
Changes:
1. In SelectionDAG.cpp: For AArch64 targets, generate vector splats for
scalar i32/i64 memset operations, which are then efficiently lowered to
DUP instructions.
2. In AArch64ISelLowering.cpp: Modify getOptimalMemOpType and
getOptimalMemOpLLT to return v16i8 for non-zero memset operations of
any size when NEON is available (previously only for sizes >= 32 bytes).
3. Update test expectations to verify the new DUP-based code generation
for both NEON and GPR code paths.
The optimization is restricted to AArch64 so that existing RISC-V and X86
tests keep their current codegen. A standalone check of the byte-splat
equivalence that item 1 relies on is sketched below.
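The SelectionDAG change in item 1 is sound because a byte-vector splat
bitcast back to the integer type is bit-identical to the multiply-based
pattern (all bytes are equal, so endianness does not matter). A small
self-contained C check of that equivalence, illustrative only:

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    int main(void) {
        for (unsigned v = 0; v < 256; v++) {
            /* Old pattern: zero-extend, then multiply by 0x0101...01. */
            uint64_t mul_pattern = (uint64_t)v * 0x0101010101010101ull;
            /* New pattern: v8i8 splat (as dup v0.8b would produce),
               then bitcast to i64, modeled here as memset + memcpy. */
            uint8_t bytes[8];
            memset(bytes, (int)v, sizeof bytes);
            uint64_t splat_pattern;
            memcpy(&splat_pattern, bytes, sizeof splat_pattern);
            assert(mul_pattern == splat_pattern);
        }
        return 0;
    }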
Signed-off-by: Osama Abdelkader <osama.abdelkader at gmail.com>
---
.../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 14 +++
.../Target/AArch64/AArch64ISelLowering.cpp | 51 ++++++++---
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 3 +
llvm/test/CodeGen/AArch64/memset-inline.ll | 86 ++++++++++++-------
4 files changed, 113 insertions(+), 41 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 379242ec5a157..d1fcb802c5268 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -8543,6 +8543,20 @@ static SDValue getMemsetValue(SDValue Value, EVT VT, SelectionDAG &DAG,
if (!IntVT.isInteger())
IntVT = EVT::getIntegerVT(*DAG.getContext(), IntVT.getSizeInBits());
+ // For repeated-byte patterns, generate a vector splat instead of MUL to
+ // enable efficient lowering to DUP on targets like AArch64.
+ // Only do this on AArch64 targets to avoid breaking other architectures.
+ const TargetMachine &TM = DAG.getTarget();
+ if (NumBits > 8 && VT.isInteger() && !VT.isVector() &&
+ (NumBits == 32 || NumBits == 64) &&
+ TM.getTargetTriple().getArch() == Triple::aarch64) {
+ // Generate a vector of bytes: v4i8 for i32, v8i8 for i64
+ EVT ByteVecTy = EVT::getVectorVT(*DAG.getContext(), MVT::i8, NumBits / 8);
+ SDValue VecSplat = DAG.getSplatBuildVector(ByteVecTy, dl, Value);
+ // Bitcast back to the target integer type
+ return DAG.getNode(ISD::BITCAST, dl, IntVT, VecSplat);
+ }
+
Value = DAG.getNode(ISD::ZERO_EXTEND, dl, IntVT, Value);
if (NumBits > 8) {
// Use a multiplication with 0x010101... to extend the input to the
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 60aa61e993b26..170ae6ee8a89b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -18328,10 +18328,11 @@ EVT AArch64TargetLowering::getOptimalMemOpType(
bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
- // Only use AdvSIMD to implement memset of 32-byte and above. It would have
+ // For zero memset, only use AdvSIMD for 32-byte and above. It would have
// taken one instruction to materialize the v2i64 zero and one store (with
// restrictive addressing mode). Just do i64 stores.
- bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
+ // For non-zero memset, use NEON even for smaller sizes as dup is efficient.
+ bool IsSmallZeroMemset = Op.isMemset() && Op.size() < 32 && Op.isZeroMemset();
auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
if (Op.isAligned(AlignCheck))
return true;
@@ -18341,10 +18342,12 @@ EVT AArch64TargetLowering::getOptimalMemOpType(
Fast;
};
- if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
- AlignmentIsAcceptable(MVT::v16i8, Align(16)))
+ // For non-zero memset, use NEON even for smaller sizes as dup + scalar store
+ // is efficient
+ if (CanUseNEON && Op.isMemset() && !IsSmallZeroMemset)
return MVT::v16i8;
- if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
+ if (CanUseFP && !IsSmallZeroMemset &&
+ AlignmentIsAcceptable(MVT::f128, Align(16)))
return MVT::f128;
if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
return MVT::i64;
@@ -18358,10 +18361,11 @@ LLT AArch64TargetLowering::getOptimalMemOpLLT(
bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
- // Only use AdvSIMD to implement memset of 32-byte and above. It would have
+ // For zero memset, only use AdvSIMD for 32-byte and above. It would have
// taken one instruction to materialize the v2i64 zero and one store (with
// restrictive addressing mode). Just do i64 stores.
- bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
+ // For non-zero memset, use NEON even for smaller sizes as dup is efficient.
+ bool IsSmallZeroMemset = Op.isMemset() && Op.size() < 32 && Op.isZeroMemset();
auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
if (Op.isAligned(AlignCheck))
return true;
@@ -18371,10 +18375,12 @@ LLT AArch64TargetLowering::getOptimalMemOpLLT(
Fast;
};
- if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
- AlignmentIsAcceptable(MVT::v2i64, Align(16)))
+ // For non-zero memset, use NEON for all sizes where it's beneficial.
+ // NEON dup + scalar store works for any alignment and is efficient.
+ if (CanUseNEON && Op.isMemset() && !IsSmallZeroMemset)
return LLT::fixed_vector(2, 64);
- if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
+ if (CanUseFP && !IsSmallZeroMemset &&
+ AlignmentIsAcceptable(MVT::f128, Align(16)))
return LLT::scalar(128);
if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
return LLT::scalar(64);
@@ -29702,6 +29708,31 @@ AArch64TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
.getInstr();
}
+bool AArch64TargetLowering::shallExtractConstSplatVectorElementToStore(
+ Type *VectorTy, unsigned ElemSizeInBits, unsigned &Index) const {
+ // On AArch64, we can efficiently extract a scalar from a splat vector using
+ // str s/d/q0 which extracts 32/64/128 bits from the vector register.
+ // This is useful for memset where we generate a v16i8 splat and need to store
+ // a smaller scalar (e.g., i32 for a 4-byte memset).
+ if (FixedVectorType *VTy = dyn_cast<FixedVectorType>(VectorTy)) {
+ // Only handle v16i8 splat (128 bits total, 16 elements of 8 bits each)
+ if (VTy->getNumElements() == 16 && VTy->getElementType()->isIntegerTy(8)) {
+ // Check if we're extracting a 32-bit or 64-bit element
+ if (ElemSizeInBits == 32) {
+ // Extract element 0 of the 128-bit vector as a 32-bit scalar
+ Index = 0;
+ return true;
+ }
+ if (ElemSizeInBits == 64) {
+ // Extract elements 0-7 as a 64-bit scalar
+ Index = 0;
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const {
return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 2cb8ed29f252a..37fadf8a2b0b1 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -475,6 +475,9 @@ class AArch64TargetLowering : public TargetLowering {
MachineBasicBlock::instr_iterator &MBBI,
const TargetInstrInfo *TII) const override;
+ bool shallExtractConstSplatVectorElementToStore(
+ Type *VectorTy, unsigned ElemSizeInBits, unsigned &Index) const override;
+
/// Enable aggressive FMA fusion on targets that want it.
bool enableAggressiveFMAFusion(EVT VT) const override;
diff --git a/llvm/test/CodeGen/AArch64/memset-inline.ll b/llvm/test/CodeGen/AArch64/memset-inline.ll
index 02d852b5ce45a..ed9a752dc1f8d 100644
--- a/llvm/test/CodeGen/AArch64/memset-inline.ll
+++ b/llvm/test/CodeGen/AArch64/memset-inline.ll
@@ -27,39 +27,57 @@ define void @memset_2(ptr %a, i8 %value) nounwind {
}
define void @memset_4(ptr %a, i8 %value) nounwind {
-; ALL-LABEL: memset_4:
-; ALL: // %bb.0:
-; ALL-NEXT: mov w8, #16843009
-; ALL-NEXT: and w9, w1, #0xff
-; ALL-NEXT: mul w8, w9, w8
-; ALL-NEXT: str w8, [x0]
-; ALL-NEXT: ret
+; GPR-LABEL: memset_4:
+; GPR: // %bb.0:
+; GPR-NEXT: mov w8, #16843009
+; GPR-NEXT: and w9, w1, #0xff
+; GPR-NEXT: mul w8, w9, w8
+; GPR-NEXT: str w8, [x0]
+; GPR-NEXT: ret
+;
+; NEON-LABEL: memset_4:
+; NEON: // %bb.0:
+; NEON-NEXT: dup v0.8b, w1
+; NEON-NEXT: str s0, [x0]
+; NEON-NEXT: ret
tail call void @llvm.memset.inline.p0.i64(ptr %a, i8 %value, i64 4, i1 0)
ret void
}
define void @memset_8(ptr %a, i8 %value) nounwind {
-; ALL-LABEL: memset_8:
-; ALL: // %bb.0:
-; ALL-NEXT: // kill: def $w1 killed $w1 def $x1
-; ALL-NEXT: mov x8, #72340172838076673
-; ALL-NEXT: and x9, x1, #0xff
-; ALL-NEXT: mul x8, x9, x8
-; ALL-NEXT: str x8, [x0]
-; ALL-NEXT: ret
+; GPR-LABEL: memset_8:
+; GPR: // %bb.0:
+; GPR-NEXT: // kill: def $w1 killed $w1 def $x1
+; GPR-NEXT: mov x8, #72340172838076673
+; GPR-NEXT: and x9, x1, #0xff
+; GPR-NEXT: mul x8, x9, x8
+; GPR-NEXT: str x8, [x0]
+; GPR-NEXT: ret
+;
+; NEON-LABEL: memset_8:
+; NEON: // %bb.0:
+; NEON-NEXT: dup v0.8b, w1
+; NEON-NEXT: str d0, [x0]
+; NEON-NEXT: ret
tail call void @llvm.memset.inline.p0.i64(ptr %a, i8 %value, i64 8, i1 0)
ret void
}
define void @memset_16(ptr %a, i8 %value) nounwind {
-; ALL-LABEL: memset_16:
-; ALL: // %bb.0:
-; ALL-NEXT: // kill: def $w1 killed $w1 def $x1
-; ALL-NEXT: mov x8, #72340172838076673
-; ALL-NEXT: and x9, x1, #0xff
-; ALL-NEXT: mul x8, x9, x8
-; ALL-NEXT: stp x8, x8, [x0]
-; ALL-NEXT: ret
+; GPR-LABEL: memset_16:
+; GPR: // %bb.0:
+; GPR-NEXT: // kill: def $w1 killed $w1 def $x1
+; GPR-NEXT: mov x8, #72340172838076673
+; GPR-NEXT: and x9, x1, #0xff
+; GPR-NEXT: mul x8, x9, x8
+; GPR-NEXT: stp x8, x8, [x0]
+; GPR-NEXT: ret
+;
+; NEON-LABEL: memset_16:
+; NEON: // %bb.0:
+; NEON-NEXT: dup v0.16b, w1
+; NEON-NEXT: str q0, [x0]
+; NEON-NEXT: ret
tail call void @llvm.memset.inline.p0.i64(ptr %a, i8 %value, i64 16, i1 0)
ret void
}
@@ -110,14 +128,20 @@ define void @memset_64(ptr %a, i8 %value) nounwind {
; /////////////////////////////////////////////////////////////////////////////
define void @aligned_memset_16(ptr align 16 %a, i8 %value) nounwind {
-; ALL-LABEL: aligned_memset_16:
-; ALL: // %bb.0:
-; ALL-NEXT: // kill: def $w1 killed $w1 def $x1
-; ALL-NEXT: mov x8, #72340172838076673
-; ALL-NEXT: and x9, x1, #0xff
-; ALL-NEXT: mul x8, x9, x8
-; ALL-NEXT: stp x8, x8, [x0]
-; ALL-NEXT: ret
+; GPR-LABEL: aligned_memset_16:
+; GPR: // %bb.0:
+; GPR-NEXT: // kill: def $w1 killed $w1 def $x1
+; GPR-NEXT: mov x8, #72340172838076673
+; GPR-NEXT: and x9, x1, #0xff
+; GPR-NEXT: mul x8, x9, x8
+; GPR-NEXT: stp x8, x8, [x0]
+; GPR-NEXT: ret
+;
+; NEON-LABEL: aligned_memset_16:
+; NEON: // %bb.0:
+; NEON-NEXT: dup v0.16b, w1
+; NEON-NEXT: str q0, [x0]
+; NEON-NEXT: ret
tail call void @llvm.memset.inline.p0.i64(ptr align 16 %a, i8 %value, i64 16, i1 0)
ret void
}
From 5415817eca5b2b3c6f1e1f08977e0289bd333286 Mon Sep 17 00:00:00 2001
From: Osama Abdelkader <osama.abdelkader at gmail.com>
Date: Tue, 18 Nov 2025 19:51:10 +0200
Subject: [PATCH 2/2] AArch64: move memset optimization into AArch64 lowering
Signed-off-by: Osama Abdelkader <osama.abdelkader at gmail.com>
---
.../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 20 +--
.../Target/AArch64/AArch64ISelLowering.cpp | 54 ++++--
llvm/lib/Target/AArch64/AArch64ISelLowering.h | 6 +
.../AArch64/GlobalISel/inline-memset.mir | 24 +--
llvm/test/CodeGen/AArch64/aarch64-mops.ll | 86 ++++------
.../CodeGen/AArch64/arm64-memset-inline.ll | 29 ++--
llvm/test/CodeGen/AArch64/memcpy-scoped-aa.ll | 13 +-
llvm/test/CodeGen/AArch64/memset-inline.ll | 162 +++++++++++++++++-
.../AArch64/memset-vs-memset-inline.ll | 7 +-
.../CodeGen/AArch64/mops-register-alias.ll | 11 +-
10 files changed, 287 insertions(+), 125 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index d1fcb802c5268..4a84cd1c5ee3b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -8543,20 +8543,6 @@ static SDValue getMemsetValue(SDValue Value, EVT VT, SelectionDAG &DAG,
if (!IntVT.isInteger())
IntVT = EVT::getIntegerVT(*DAG.getContext(), IntVT.getSizeInBits());
- // For repeated-byte patterns, generate a vector splat instead of MUL to
- // enable efficient lowering to DUP on targets like AArch64.
- // Only do this on AArch64 targets to avoid breaking other architectures.
- const TargetMachine &TM = DAG.getTarget();
- if (NumBits > 8 && VT.isInteger() && !VT.isVector() &&
- (NumBits == 32 || NumBits == 64) &&
- TM.getTargetTriple().getArch() == Triple::aarch64) {
- // Generate a vector of bytes: v4i8 for i32, v8i8 for i64
- EVT ByteVecTy = EVT::getVectorVT(*DAG.getContext(), MVT::i8, NumBits / 8);
- SDValue VecSplat = DAG.getSplatBuildVector(ByteVecTy, dl, Value);
- // Bitcast back to the target integer type
- return DAG.getNode(ISD::BITCAST, dl, IntVT, VecSplat);
- }
-
Value = DAG.getNode(ISD::ZERO_EXTEND, dl, IntVT, Value);
if (NumBits > 8) {
// Use a multiplication with 0x010101... to extend the input to the
@@ -9089,6 +9075,12 @@ static SDValue getMemsetStores(SelectionDAG &DAG, const SDLoc &dl,
for (unsigned i = 0; i < NumMemOps; i++) {
EVT VT = MemOps[i];
unsigned VTSize = VT.getSizeInBits() / 8;
+ // Skip stores when Size is already 0. This can happen when an oversized
+ // store was added to MemOps but the actual memset size was already
+ // covered by previous stores (e.g., when using extraction from a larger
+ // vector splat).
+ if (Size == 0)
+ continue;
if (VTSize > Size) {
// Issuing an unaligned load / store pair that overlaps with the previous
// pair. Adjust the offset accordingly.
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 170ae6ee8a89b..8596ec1b04ebf 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -18344,7 +18344,8 @@ EVT AArch64TargetLowering::getOptimalMemOpType(
// For non-zero memset, use NEON even for smaller sizes as dup + scalar store
// is efficient
- if (CanUseNEON && Op.isMemset() && !IsSmallZeroMemset)
+ if (CanUseNEON && Op.isMemset() && !IsSmallZeroMemset &&
+ AlignmentIsAcceptable(MVT::v16i8, Align(1)))
return MVT::v16i8;
if (CanUseFP && !IsSmallZeroMemset &&
AlignmentIsAcceptable(MVT::f128, Align(16)))
@@ -18356,6 +18357,39 @@ EVT AArch64TargetLowering::getOptimalMemOpType(
return MVT::Other;
}
+bool AArch64TargetLowering::findOptimalMemOpLowering(
+ LLVMContext &Context, std::vector<EVT> &MemOps, unsigned Limit,
+ const MemOp &Op, unsigned DstAS, unsigned SrcAS,
+ const AttributeList &FuncAttributes) const {
+ // For non-zero memset with v16i8, don't downgrade if we can extract
+ // the needed size efficiently using
+ // shallExtractConstSplatVectorElementToStore
+ EVT VT = getOptimalMemOpType(Context, Op, FuncAttributes);
+ if (VT == MVT::v16i8 && Op.isMemset() && !Op.isZeroMemset() &&
+ Op.size() < 16) {
+ // Check if we can extract the needed size
+ unsigned Index;
+ Type *VectorTy = VT.getTypeForEVT(Context);
+ if (shallExtractConstSplatVectorElementToStore(VectorTy, Op.size() * 8,
+ Index)) {
+ // To generate the vector splat (DUP), we need v16i8 to be the LargestVT.
+ // getMemsetStores requires oversized stores to be last with at least 2
+ // operations. We add the target size first (extracts from v16i8), then
+ // v16i8 last (satisfies assertion, and is LargestVT for splat
+ // generation). After the first store, Size becomes 0, so the oversized
+ // store is skipped by the early continue in getMemsetStores, avoiding
+ // redundant stores.
+ EVT TargetVT = (Op.size() >= 8) ? MVT::i64 : MVT::i32;
+ MemOps.push_back(TargetVT); // First: extract from v16i8
+ MemOps.push_back(VT); // Last: v16i8 (LargestVT, oversized)
+ return true;
+ }
+ }
+ // Otherwise, use the default implementation
+ return TargetLowering::findOptimalMemOpLowering(Context, MemOps, Limit, Op,
+ DstAS, SrcAS, FuncAttributes);
+}
+
LLT AArch64TargetLowering::getOptimalMemOpLLT(
const MemOp &Op, const AttributeList &FuncAttributes) const {
bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
@@ -18377,7 +18411,8 @@ LLT AArch64TargetLowering::getOptimalMemOpLLT(
// For non-zero memset, use NEON for all sizes where it's beneficial.
// NEON dup + scalar store works for any alignment and is efficient.
- if (CanUseNEON && Op.isMemset() && !IsSmallZeroMemset)
+ if (CanUseNEON && Op.isMemset() && !IsSmallZeroMemset &&
+ AlignmentIsAcceptable(MVT::v16i8, Align(1)))
return LLT::fixed_vector(2, 64);
if (CanUseFP && !IsSmallZeroMemset &&
AlignmentIsAcceptable(MVT::f128, Align(16)))
@@ -29715,16 +29750,13 @@ bool AArch64TargetLowering::shallExtractConstSplatVectorElementToStore(
// This is useful for memset where we generate a v16i8 splat and need to store
// a smaller scalar (e.g., i32 for a 4-byte memset).
if (FixedVectorType *VTy = dyn_cast<FixedVectorType>(VectorTy)) {
- // Only handle v16i8 splat (128 bits total, 16 elements of 8 bits each)
- if (VTy->getNumElements() == 16 && VTy->getElementType()->isIntegerTy(8)) {
+ // Handle v16i8 splat (128 bits total, 16 elements of 8 bits each) and
+ // v8i8 splat (64 bits total, 8 elements of 8 bits each)
+ if ((VTy->getNumElements() == 16 || VTy->getNumElements() == 8) &&
+ VTy->getElementType()->isIntegerTy(8)) {
// Check if we're extracting a 32-bit or 64-bit element
- if (ElemSizeInBits == 32) {
- // Extract element 0 of the 128-bit vector as a 32-bit scalar
- Index = 0;
- return true;
- }
- if (ElemSizeInBits == 64) {
- // Extract elements 0-7 as a 64-bit scalar
+ if (ElemSizeInBits == 32 || ElemSizeInBits == 64) {
+ // Extract element 0 from the vector as a scalar
Index = 0;
return true;
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 37fadf8a2b0b1..6dfff3c4c6bac 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -261,6 +261,12 @@ class AArch64TargetLowering : public TargetLowering {
LLT getOptimalMemOpLLT(const MemOp &Op,
const AttributeList &FuncAttributes) const override;
+ bool
+ findOptimalMemOpLowering(LLVMContext &Context, std::vector<EVT> &MemOps,
+ unsigned Limit, const MemOp &Op, unsigned DstAS,
+ unsigned SrcAS,
+ const AttributeList &FuncAttributes) const override;
+
/// Return true if the addressing mode represented by AM is legal for this
/// target, for a load/store of the specified type.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir
index e4d2ca32468f6..af01789fa75d4 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir
@@ -98,10 +98,8 @@ body: |
; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[TRUNC]](s8)
; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 72340172838076673
; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[ZEXT]], [[C]]
- ; CHECK-NEXT: G_STORE [[MUL]](s64), [[COPY]](p0) :: (store (s64) into %ir.dst, align 1)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
- ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = nuw inbounds G_PTR_ADD [[COPY]], [[C1]](s64)
- ; CHECK-NEXT: G_STORE [[MUL]](s64), [[PTR_ADD]](p0) :: (store (s64) into %ir.dst + 8, align 1)
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MUL]](s64), [[MUL]](s64)
+ ; CHECK-NEXT: G_STORE [[BUILD_VECTOR]](<2 x s64>), [[COPY]](p0) :: (store (<2 x s64>) into %ir.dst, align 1)
; CHECK-NEXT: RET_ReallyLR
%0:_(p0) = COPY $x0
%1:_(s32) = COPY $w1
@@ -158,10 +156,8 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4629771061636907072
- ; CHECK-NEXT: G_STORE [[C]](s64), [[COPY]](p0) :: (store (s64) into %ir.dst, align 1)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
- ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = nuw inbounds G_PTR_ADD [[COPY]], [[C1]](s64)
- ; CHECK-NEXT: G_STORE [[C]](s64), [[PTR_ADD]](p0) :: (store (s64) into %ir.dst + 8, align 1)
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C]](s64)
+ ; CHECK-NEXT: G_STORE [[BUILD_VECTOR]](<2 x s64>), [[COPY]](p0) :: (store (<2 x s64>) into %ir.dst, align 1)
; CHECK-NEXT: RET_ReallyLR
%0:_(p0) = COPY $x0
%1:_(s8) = G_CONSTANT i8 64
@@ -220,10 +216,8 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4629771061636907072
- ; CHECK-NEXT: G_STORE [[C]](s64), [[COPY]](p0) :: (store (s64) into %ir.dst, align 1)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
- ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = nuw inbounds G_PTR_ADD [[COPY]], [[C1]](s64)
- ; CHECK-NEXT: G_STORE [[C]](s64), [[PTR_ADD]](p0) :: (store (s64) into %ir.dst + 8, align 1)
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C]](s64)
+ ; CHECK-NEXT: G_STORE [[BUILD_VECTOR]](<2 x s64>), [[COPY]](p0) :: (store (<2 x s64>) into %ir.dst, align 1)
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 16448
; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = nuw inbounds G_PTR_ADD [[COPY]], [[C3]](s64)
@@ -252,10 +246,8 @@ body: |
; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[TRUNC]](s8)
; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 72340172838076673
; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[ZEXT]], [[C]]
- ; CHECK-NEXT: G_STORE [[MUL]](s64), [[COPY]](p0) :: (store (s64) into %ir.dst, align 1)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
- ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = nuw inbounds G_PTR_ADD [[COPY]], [[C1]](s64)
- ; CHECK-NEXT: G_STORE [[MUL]](s64), [[PTR_ADD]](p0) :: (store (s64) into %ir.dst + 8, align 1)
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MUL]](s64), [[MUL]](s64)
+ ; CHECK-NEXT: G_STORE [[BUILD_VECTOR]](<2 x s64>), [[COPY]](p0) :: (store (<2 x s64>) into %ir.dst, align 1)
; CHECK-NEXT: RET_ReallyLR
%0:_(p0) = COPY $x0
%1:_(s32) = COPY $w1
diff --git a/llvm/test/CodeGen/AArch64/aarch64-mops.ll b/llvm/test/CodeGen/AArch64/aarch64-mops.ll
index 83530049a50d6..13a5464281a24 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-mops.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-mops.ll
@@ -391,45 +391,39 @@ entry:
define void @memset_10(ptr %dst, i32 %value) {
; GISel-WITHOUT-MOPS-O0-LABEL: memset_10:
; GISel-WITHOUT-MOPS-O0: // %bb.0: // %entry
-; GISel-WITHOUT-MOPS-O0-NEXT: // implicit-def: $x8
-; GISel-WITHOUT-MOPS-O0-NEXT: mov w8, w1
-; GISel-WITHOUT-MOPS-O0-NEXT: and x8, x8, #0xff
-; GISel-WITHOUT-MOPS-O0-NEXT: mov x9, #72340172838076673 // =0x101010101010101
-; GISel-WITHOUT-MOPS-O0-NEXT: mul x8, x8, x9
-; GISel-WITHOUT-MOPS-O0-NEXT: str x8, [x0]
-; GISel-WITHOUT-MOPS-O0-NEXT: // kill: def $w8 killed $w8 killed $x8
+; GISel-WITHOUT-MOPS-O0-NEXT: and w8, w1, #0xff
+; GISel-WITHOUT-MOPS-O0-NEXT: mov w9, #16843009 // =0x1010101
+; GISel-WITHOUT-MOPS-O0-NEXT: mul w8, w8, w9
+; GISel-WITHOUT-MOPS-O0-NEXT: str w8, [x0]
+; GISel-WITHOUT-MOPS-O0-NEXT: str w8, [x0, #4]
; GISel-WITHOUT-MOPS-O0-NEXT: strh w8, [x0, #8]
; GISel-WITHOUT-MOPS-O0-NEXT: ret
;
; GISel-WITHOUT-MOPS-O3-LABEL: memset_10:
; GISel-WITHOUT-MOPS-O3: // %bb.0: // %entry
-; GISel-WITHOUT-MOPS-O3-NEXT: // kill: def $w1 killed $w1 def $x1
-; GISel-WITHOUT-MOPS-O3-NEXT: mov x8, #72340172838076673 // =0x101010101010101
-; GISel-WITHOUT-MOPS-O3-NEXT: and x9, x1, #0xff
-; GISel-WITHOUT-MOPS-O3-NEXT: mul x8, x9, x8
-; GISel-WITHOUT-MOPS-O3-NEXT: str x8, [x0]
+; GISel-WITHOUT-MOPS-O3-NEXT: mov w8, #16843009 // =0x1010101
+; GISel-WITHOUT-MOPS-O3-NEXT: and w9, w1, #0xff
+; GISel-WITHOUT-MOPS-O3-NEXT: mul w8, w9, w8
+; GISel-WITHOUT-MOPS-O3-NEXT: stp w8, w8, [x0]
; GISel-WITHOUT-MOPS-O3-NEXT: strh w8, [x0, #8]
; GISel-WITHOUT-MOPS-O3-NEXT: ret
;
; GISel-MOPS-O0-LABEL: memset_10:
; GISel-MOPS-O0: // %bb.0: // %entry
-; GISel-MOPS-O0-NEXT: // implicit-def: $x8
-; GISel-MOPS-O0-NEXT: mov w8, w1
-; GISel-MOPS-O0-NEXT: and x8, x8, #0xff
-; GISel-MOPS-O0-NEXT: mov x9, #72340172838076673 // =0x101010101010101
-; GISel-MOPS-O0-NEXT: mul x8, x8, x9
-; GISel-MOPS-O0-NEXT: str x8, [x0]
-; GISel-MOPS-O0-NEXT: // kill: def $w8 killed $w8 killed $x8
+; GISel-MOPS-O0-NEXT: and w8, w1, #0xff
+; GISel-MOPS-O0-NEXT: mov w9, #16843009 // =0x1010101
+; GISel-MOPS-O0-NEXT: mul w8, w8, w9
+; GISel-MOPS-O0-NEXT: str w8, [x0]
+; GISel-MOPS-O0-NEXT: str w8, [x0, #4]
; GISel-MOPS-O0-NEXT: strh w8, [x0, #8]
; GISel-MOPS-O0-NEXT: ret
;
; GISel-MOPS-O3-LABEL: memset_10:
; GISel-MOPS-O3: // %bb.0: // %entry
-; GISel-MOPS-O3-NEXT: // kill: def $w1 killed $w1 def $x1
-; GISel-MOPS-O3-NEXT: mov x8, #72340172838076673 // =0x101010101010101
-; GISel-MOPS-O3-NEXT: and x9, x1, #0xff
-; GISel-MOPS-O3-NEXT: mul x8, x9, x8
-; GISel-MOPS-O3-NEXT: str x8, [x0]
+; GISel-MOPS-O3-NEXT: mov w8, #16843009 // =0x1010101
+; GISel-MOPS-O3-NEXT: and w9, w1, #0xff
+; GISel-MOPS-O3-NEXT: mul w8, w9, w8
+; GISel-MOPS-O3-NEXT: stp w8, w8, [x0]
; GISel-MOPS-O3-NEXT: strh w8, [x0, #8]
; GISel-MOPS-O3-NEXT: ret
;
@@ -461,45 +455,41 @@ entry:
define void @memset_10_volatile(ptr %dst, i32 %value) {
; GISel-WITHOUT-MOPS-O0-LABEL: memset_10_volatile:
; GISel-WITHOUT-MOPS-O0: // %bb.0: // %entry
-; GISel-WITHOUT-MOPS-O0-NEXT: // implicit-def: $x8
-; GISel-WITHOUT-MOPS-O0-NEXT: mov w8, w1
-; GISel-WITHOUT-MOPS-O0-NEXT: and x8, x8, #0xff
-; GISel-WITHOUT-MOPS-O0-NEXT: mov x9, #72340172838076673 // =0x101010101010101
-; GISel-WITHOUT-MOPS-O0-NEXT: mul x8, x8, x9
-; GISel-WITHOUT-MOPS-O0-NEXT: str x8, [x0]
-; GISel-WITHOUT-MOPS-O0-NEXT: // kill: def $w8 killed $w8 killed $x8
+; GISel-WITHOUT-MOPS-O0-NEXT: and w8, w1, #0xff
+; GISel-WITHOUT-MOPS-O0-NEXT: mov w9, #16843009 // =0x1010101
+; GISel-WITHOUT-MOPS-O0-NEXT: mul w8, w8, w9
+; GISel-WITHOUT-MOPS-O0-NEXT: str w8, [x0]
+; GISel-WITHOUT-MOPS-O0-NEXT: str w8, [x0, #4]
; GISel-WITHOUT-MOPS-O0-NEXT: strh w8, [x0, #8]
; GISel-WITHOUT-MOPS-O0-NEXT: ret
;
; GISel-WITHOUT-MOPS-O3-LABEL: memset_10_volatile:
; GISel-WITHOUT-MOPS-O3: // %bb.0: // %entry
-; GISel-WITHOUT-MOPS-O3-NEXT: // kill: def $w1 killed $w1 def $x1
-; GISel-WITHOUT-MOPS-O3-NEXT: mov x8, #72340172838076673 // =0x101010101010101
-; GISel-WITHOUT-MOPS-O3-NEXT: and x9, x1, #0xff
-; GISel-WITHOUT-MOPS-O3-NEXT: mul x8, x9, x8
-; GISel-WITHOUT-MOPS-O3-NEXT: str x8, [x0]
+; GISel-WITHOUT-MOPS-O3-NEXT: mov w8, #16843009 // =0x1010101
+; GISel-WITHOUT-MOPS-O3-NEXT: and w9, w1, #0xff
+; GISel-WITHOUT-MOPS-O3-NEXT: mul w8, w9, w8
+; GISel-WITHOUT-MOPS-O3-NEXT: str w8, [x0]
+; GISel-WITHOUT-MOPS-O3-NEXT: str w8, [x0, #4]
; GISel-WITHOUT-MOPS-O3-NEXT: strh w8, [x0, #8]
; GISel-WITHOUT-MOPS-O3-NEXT: ret
;
; GISel-MOPS-O0-LABEL: memset_10_volatile:
; GISel-MOPS-O0: // %bb.0: // %entry
-; GISel-MOPS-O0-NEXT: // implicit-def: $x8
-; GISel-MOPS-O0-NEXT: mov w8, w1
-; GISel-MOPS-O0-NEXT: and x8, x8, #0xff
-; GISel-MOPS-O0-NEXT: mov x9, #72340172838076673 // =0x101010101010101
-; GISel-MOPS-O0-NEXT: mul x8, x8, x9
-; GISel-MOPS-O0-NEXT: str x8, [x0]
-; GISel-MOPS-O0-NEXT: // kill: def $w8 killed $w8 killed $x8
+; GISel-MOPS-O0-NEXT: and w8, w1, #0xff
+; GISel-MOPS-O0-NEXT: mov w9, #16843009 // =0x1010101
+; GISel-MOPS-O0-NEXT: mul w8, w8, w9
+; GISel-MOPS-O0-NEXT: str w8, [x0]
+; GISel-MOPS-O0-NEXT: str w8, [x0, #4]
; GISel-MOPS-O0-NEXT: strh w8, [x0, #8]
; GISel-MOPS-O0-NEXT: ret
;
; GISel-MOPS-O3-LABEL: memset_10_volatile:
; GISel-MOPS-O3: // %bb.0: // %entry
-; GISel-MOPS-O3-NEXT: // kill: def $w1 killed $w1 def $x1
-; GISel-MOPS-O3-NEXT: mov x8, #72340172838076673 // =0x101010101010101
-; GISel-MOPS-O3-NEXT: and x9, x1, #0xff
-; GISel-MOPS-O3-NEXT: mul x8, x9, x8
-; GISel-MOPS-O3-NEXT: str x8, [x0]
+; GISel-MOPS-O3-NEXT: mov w8, #16843009 // =0x1010101
+; GISel-MOPS-O3-NEXT: and w9, w1, #0xff
+; GISel-MOPS-O3-NEXT: mul w8, w9, w8
+; GISel-MOPS-O3-NEXT: str w8, [x0]
+; GISel-MOPS-O3-NEXT: str w8, [x0, #4]
; GISel-MOPS-O3-NEXT: strh w8, [x0, #8]
; GISel-MOPS-O3-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AArch64/arm64-memset-inline.ll b/llvm/test/CodeGen/AArch64/arm64-memset-inline.ll
index 52b09885ebb1c..c6fdbdbe82305 100644
--- a/llvm/test/CodeGen/AArch64/arm64-memset-inline.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-memset-inline.ll
@@ -331,7 +331,7 @@ define void @memset_8_stack() {
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset w30, -16
; CHECK-NEXT: mov x8, #-6148914691236517206
-; CHECK-NEXT: stp x30, x8, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: stp x30, x8, [sp, #-16]!
; CHECK-NEXT: add x0, sp, #8
; CHECK-NEXT: bl something
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
@@ -367,12 +367,12 @@ define void @memset_16_stack() {
; CHECK-LABEL: memset_16_stack:
; CHECK: // %bb.0:
; CHECK-NEXT: sub sp, sp, #32
+; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: mov x8, #-6148914691236517206
+; CHECK-NEXT: movi v0.16b, #170
; CHECK-NEXT: mov x0, sp
-; CHECK-NEXT: stp x8, x30, [sp, #8] // 8-byte Folded Spill
-; CHECK-NEXT: str x8, [sp]
+; CHECK-NEXT: str q0, [sp]
; CHECK-NEXT: bl something
; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
; CHECK-NEXT: add sp, sp, #32
@@ -390,10 +390,10 @@ define void @memset_20_stack() {
; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: mov x8, #-6148914691236517206
-; CHECK-NEXT: add x0, sp, #8
-; CHECK-NEXT: stp x8, x8, [sp, #8]
-; CHECK-NEXT: str w8, [sp, #24]
+; CHECK-NEXT: movi v0.16b, #170
+; CHECK-NEXT: mov x0, sp
+; CHECK-NEXT: str q0, [sp]
+; CHECK-NEXT: str s0, [sp, #16]
; CHECK-NEXT: bl something
; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
; CHECK-NEXT: add sp, sp, #48
@@ -411,11 +411,10 @@ define void @memset_26_stack() {
; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 48
; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: mov x8, #-6148914691236517206
+; CHECK-NEXT: movi v0.16b, #170
; CHECK-NEXT: mov x0, sp
-; CHECK-NEXT: stp x8, x8, [sp, #8]
-; CHECK-NEXT: str x8, [sp]
-; CHECK-NEXT: strh w8, [sp, #24]
+; CHECK-NEXT: stur q0, [sp, #10]
+; CHECK-NEXT: str q0, [sp]
; CHECK-NEXT: bl something
; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
; CHECK-NEXT: add sp, sp, #48
@@ -454,10 +453,9 @@ define void @memset_40_stack() {
; CHECK-NEXT: .cfi_def_cfa_offset 64
; CHECK-NEXT: .cfi_offset w30, -16
; CHECK-NEXT: movi v0.16b, #170
-; CHECK-NEXT: mov x8, #-6148914691236517206
; CHECK-NEXT: mov x0, sp
-; CHECK-NEXT: str x8, [sp, #32]
; CHECK-NEXT: stp q0, q0, [sp]
+; CHECK-NEXT: str d0, [sp, #32]
; CHECK-NEXT: bl something
; CHECK-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload
; CHECK-NEXT: add sp, sp, #64
@@ -497,11 +495,10 @@ define void @memset_72_stack() {
; CHECK-NEXT: .cfi_def_cfa_offset 96
; CHECK-NEXT: .cfi_offset w30, -16
; CHECK-NEXT: movi v0.16b, #170
-; CHECK-NEXT: mov x8, #-6148914691236517206
; CHECK-NEXT: mov x0, sp
-; CHECK-NEXT: str x8, [sp, #64]
; CHECK-NEXT: stp q0, q0, [sp]
; CHECK-NEXT: stp q0, q0, [sp, #32]
+; CHECK-NEXT: str d0, [sp, #64]
; CHECK-NEXT: bl something
; CHECK-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: add sp, sp, #96
diff --git a/llvm/test/CodeGen/AArch64/memcpy-scoped-aa.ll b/llvm/test/CodeGen/AArch64/memcpy-scoped-aa.ll
index 7dde168024278..cc27fd75a8b20 100644
--- a/llvm/test/CodeGen/AArch64/memcpy-scoped-aa.ll
+++ b/llvm/test/CodeGen/AArch64/memcpy-scoped-aa.ll
@@ -78,17 +78,16 @@ define i32 @test_memmove(ptr nocapture %p, ptr nocapture readonly %q) {
}
; MIR-LABEL: name: test_memset
-; MIR: %2:gpr64 = MOVi64imm -6148914691236517206
-; MIR-NEXT: STRXui %2, %0, 1 :: (store (s64) into %ir.p0 + 8, align 4, !alias.scope ![[SET0]], !noalias ![[SET1]])
-; MIR-NEXT: STRXui %2, %0, 0 :: (store (s64) into %ir.p0, align 4, !alias.scope ![[SET0]], !noalias ![[SET1]])
+; MIR: %2:fpr128 = MOVIv16b_ns 170
+; MIR-NEXT: STRQui killed %2, %0, 0 :: (store (s128) into %ir.p0, align 4, !alias.scope ![[SET0]], !noalias ![[SET1]])
define i32 @test_memset(ptr nocapture %p, ptr nocapture readonly %q) {
; CHECK-LABEL: test_memset:
; CHECK: // %bb.0:
-; CHECK-NEXT: ldp w10, w11, [x1]
+; CHECK-NEXT: movi v0.16b, #170
; CHECK-NEXT: mov x8, x0
-; CHECK-NEXT: mov x9, #-6148914691236517206 // =0xaaaaaaaaaaaaaaaa
-; CHECK-NEXT: stp x9, x9, [x8]
-; CHECK-NEXT: add w0, w10, w11
+; CHECK-NEXT: ldp w9, w10, [x1]
+; CHECK-NEXT: add w0, w9, w10
+; CHECK-NEXT: str q0, [x8]
; CHECK-NEXT: ret
%p0 = bitcast ptr %p to ptr
tail call void @llvm.memset.p0.i64(ptr noundef nonnull align 4 dereferenceable(16) %p0, i8 170, i64 16, i1 false), !alias.scope !2, !noalias !4
diff --git a/llvm/test/CodeGen/AArch64/memset-inline.ll b/llvm/test/CodeGen/AArch64/memset-inline.ll
index ed9a752dc1f8d..b3e0b4a5a69c5 100644
--- a/llvm/test/CodeGen/AArch64/memset-inline.ll
+++ b/llvm/test/CodeGen/AArch64/memset-inline.ll
@@ -37,7 +37,7 @@ define void @memset_4(ptr %a, i8 %value) nounwind {
;
; NEON-LABEL: memset_4:
; NEON: // %bb.0:
-; NEON-NEXT: dup v0.8b, w1
+; NEON-NEXT: dup v0.16b, w1
; NEON-NEXT: str s0, [x0]
; NEON-NEXT: ret
tail call void @llvm.memset.inline.p0.i64(ptr %a, i8 %value, i64 4, i1 0)
@@ -56,7 +56,7 @@ define void @memset_8(ptr %a, i8 %value) nounwind {
;
; NEON-LABEL: memset_8:
; NEON: // %bb.0:
-; NEON-NEXT: dup v0.8b, w1
+; NEON-NEXT: dup v0.16b, w1
; NEON-NEXT: str d0, [x0]
; NEON-NEXT: ret
tail call void @llvm.memset.inline.p0.i64(ptr %a, i8 %value, i64 8, i1 0)
@@ -318,3 +318,161 @@ define void @aligned_bzero_64(ptr %a) nounwind {
tail call void @llvm.memset.inline.p0.i64(ptr align 64 %a, i8 0, i64 64, i1 0)
ret void
}
+
+; /////////////////////////////////////////////////////////////////////////////
+; Test cases for non-zero constants
+
+define void @memset_4_const_42(ptr %a) nounwind {
+; GPR-LABEL: memset_4_const_42:
+; GPR: // %bb.0:
+; GPR-NEXT: mov w8, #10794
+; GPR-NEXT: movk w8, #10794, lsl #16
+; GPR-NEXT: str w8, [x0]
+; GPR-NEXT: ret
+;
+; NEON-LABEL: memset_4_const_42:
+; NEON: // %bb.0:
+; NEON-NEXT: mov w8, #10794
+; NEON-NEXT: movk w8, #10794, lsl #16
+; NEON-NEXT: str w8, [x0]
+; NEON-NEXT: ret
+ tail call void @llvm.memset.inline.p0.i64(ptr %a, i8 42, i64 4, i1 0)
+ ret void
+}
+
+define void @memset_8_const_255(ptr %a) nounwind {
+; GPR-LABEL: memset_8_const_255:
+; GPR: // %bb.0:
+; GPR-NEXT: mov x8, #-1
+; GPR-NEXT: str x8, [x0]
+; GPR-NEXT: ret
+;
+; NEON-LABEL: memset_8_const_255:
+; NEON: // %bb.0:
+; NEON-NEXT: mov x8, #-1
+; NEON-NEXT: str x8, [x0]
+; NEON-NEXT: ret
+ tail call void @llvm.memset.inline.p0.i64(ptr %a, i8 255, i64 8, i1 0)
+ ret void
+}
+
+define void @memset_16_const_128(ptr %a) nounwind {
+; GPR-LABEL: memset_16_const_128:
+; GPR: // %bb.0:
+; GPR-NEXT: adrp
+; GPR-NEXT: ldr q0,
+; GPR-NEXT: str q0, [x0]
+; GPR-NEXT: ret
+;
+; NEON-LABEL: memset_16_const_128:
+; NEON: // %bb.0:
+; NEON-NEXT: movi v0.16b, #128
+; NEON-NEXT: str q0, [x0]
+; NEON-NEXT: ret
+ tail call void @llvm.memset.inline.p0.i64(ptr %a, i8 128, i64 16, i1 0)
+ ret void
+}
+
+; /////////////////////////////////////////////////////////////////////////////
+; Test cases for non-power-of-two lengths
+
+define void @memset_3(ptr %a, i8 %value) nounwind {
+; ALL-LABEL: memset_3:
+; ALL: // %bb.0:
+; ALL-NEXT: strb w1, [x0, #2]
+; ALL-NEXT: bfi w1, w1, #8, #24
+; ALL-NEXT: strh w1, [x0]
+; ALL-NEXT: ret
+ tail call void @llvm.memset.inline.p0.i64(ptr %a, i8 %value, i64 3, i1 0)
+ ret void
+}
+
+define void @memset_5(ptr %a, i8 %value) nounwind {
+; GPR-LABEL: memset_5:
+; GPR: // %bb.0:
+; GPR-NEXT: mov w8, #16843009
+; GPR-NEXT: and w9, w1, #0xff
+; GPR-NEXT: mul w8, w9, w8
+; GPR-NEXT: strb w8, [x0, #4]
+; GPR-NEXT: str w8, [x0]
+; GPR-NEXT: ret
+;
+; NEON-LABEL: memset_5:
+; NEON: // %bb.0:
+; NEON-NEXT: mov w8, #16843009
+; NEON-NEXT: and w9, w1, #0xff
+; NEON-NEXT: mul w8, w9, w8
+; NEON-NEXT: strb w8, [x0, #4]
+; NEON-NEXT: str w8, [x0]
+; NEON-NEXT: ret
+ tail call void @llvm.memset.inline.p0.i64(ptr %a, i8 %value, i64 5, i1 0)
+ ret void
+}
+
+define void @memset_6(ptr %a, i8 %value) nounwind {
+; GPR-LABEL: memset_6:
+; GPR: // %bb.0:
+; GPR-NEXT: mov w8, #16843009
+; GPR-NEXT: and w9, w1, #0xff
+; GPR-NEXT: mul w8, w9, w8
+; GPR-NEXT: strh w8, [x0, #4]
+; GPR-NEXT: str w8, [x0]
+; GPR-NEXT: ret
+;
+; NEON-LABEL: memset_6:
+; NEON: // %bb.0:
+; NEON-NEXT: mov w8, #16843009
+; NEON-NEXT: and w9, w1, #0xff
+; NEON-NEXT: mul w8, w9, w8
+; NEON-NEXT: strh w8, [x0, #4]
+; NEON-NEXT: str w8, [x0]
+; NEON-NEXT: ret
+ tail call void @llvm.memset.inline.p0.i64(ptr %a, i8 %value, i64 6, i1 0)
+ ret void
+}
+
+define void @memset_7(ptr %a, i8 %value) nounwind {
+; GPR-LABEL: memset_7:
+; GPR: // %bb.0:
+; GPR-NEXT: mov w8, #16843009
+; GPR-NEXT: and w9, w1, #0xff
+; GPR-NEXT: mul w8, w9, w8
+; GPR-NEXT: stur w8, [x0, #3]
+; GPR-NEXT: str w8, [x0]
+; GPR-NEXT: ret
+;
+; NEON-LABEL: memset_7:
+; NEON: // %bb.0:
+; NEON-NEXT: mov w8, #16843009
+; NEON-NEXT: and w9, w1, #0xff
+; NEON-NEXT: mul w8, w9, w8
+; NEON-NEXT: stur w8, [x0, #3]
+; NEON-NEXT: str w8, [x0]
+; NEON-NEXT: ret
+ tail call void @llvm.memset.inline.p0.i64(ptr %a, i8 %value, i64 7, i1 0)
+ ret void
+}
+
+define void @memset_12(ptr %a, i8 %value) nounwind {
+; GPR-LABEL: memset_12:
+; GPR: // %bb.0:
+; GPR-NEXT: // kill: def $w1 killed $w1 def $x1
+; GPR-NEXT: mov x8, #72340172838076673
+; GPR-NEXT: and x9, x1, #0xff
+; GPR-NEXT: mul x8, x9, x8
+; GPR-NEXT: str x8, [x0]
+; GPR-NEXT: str w8, [x0, #8]
+; GPR-NEXT: ret
+;
+; NEON-LABEL: memset_12:
+; NEON: // %bb.0:
+; NEON-NEXT: // kill: def $w1 killed $w1 def $x1
+; NEON-NEXT: mov x8, #72340172838076673
+; NEON-NEXT: and x9, x1, #0xff
+; NEON-NEXT: mul x8, x9, x8
+; NEON-NEXT: str x8, [x0]
+; NEON-NEXT: str w8, [x0, #8]
+; NEON-NEXT: ret
+ tail call void @llvm.memset.inline.p0.i64(ptr %a, i8 %value, i64 12, i1 0)
+ ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/memset-vs-memset-inline.ll b/llvm/test/CodeGen/AArch64/memset-vs-memset-inline.ll
index 97cfb13bcd5eb..a970a8f8fc9c0 100644
--- a/llvm/test/CodeGen/AArch64/memset-vs-memset-inline.ll
+++ b/llvm/test/CodeGen/AArch64/memset-vs-memset-inline.ll
@@ -7,11 +7,8 @@ declare void @llvm.memset.inline.p0.i64(ptr nocapture, i8, i64, i1) nounwind
define void @test1(ptr %a, i8 %value) nounwind {
; CHECK-LABEL: test1:
; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT: mov x8, #72340172838076673
-; CHECK-NEXT: and x9, x1, #0xff
-; CHECK-NEXT: mul x8, x9, x8
-; CHECK-NEXT: str x8, [x0]
+; CHECK-NEXT: dup v0.16b, w1
+; CHECK-NEXT: str d0, [x0]
; CHECK-NEXT: ret
tail call void @llvm.memset.inline.p0.i64(ptr %a, i8 %value, i64 8, i1 0)
ret void
diff --git a/llvm/test/CodeGen/AArch64/mops-register-alias.ll b/llvm/test/CodeGen/AArch64/mops-register-alias.ll
index 855ab959c4e76..963aa2774d692 100644
--- a/llvm/test/CodeGen/AArch64/mops-register-alias.ll
+++ b/llvm/test/CodeGen/AArch64/mops-register-alias.ll
@@ -3,12 +3,11 @@
define void @call_memset_intrinsic() #0 {
; CHECK-LABEL: call_memset_intrinsic:
; CHECK: // %bb.0: // %entry
-; CHECK: setp [x{{[0-9]+}}]!, x{{[0-9]+}}!, x{{[0-9]+}}
-; CHECK-NOT: setp [x{{[0-9]+}}]!, x[[REG:[0-9]+]]!, x[[REG]]
-; CHECK-NEXT: setm [x{{[0-9]+}}]!, x{{[0-9]+}}!, x{{[0-9]+}}
-; CHECK-NOT: setm [x{{[0-9]+}}]!, x[[REG:[0-9]+]]!, x[[REG]]
-; CHECK-NEXT: sete [x{{[0-9]+}}]!, x{{[0-9]+}}!, x{{[0-9]+}}
-; CHECK-NOT: sete [x{{[0-9]+}}]!, x[[REG:[0-9]+]]!, x[[REG]]
+; CHECK: movi v0.16b, #64
+; CHECK-NEXT: stp q0, q0, [sp]
+; CHECK-NEXT: stp q0, q0, [sp, #32]
+; CHECK-NEXT: stp q0, q0, [sp, #64]
+; CHECK-NEXT: stp q0, q0, [sp, #96]
entry:
%V0 = alloca [65 x i8], align 1