[llvm] b28eeb2 - [CodeGen] Generalise Hexagon flags for memop inline thresholds (#172829)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 9 04:08:40 PST 2026
Author: Ties Stuij
Date: 2026-01-09T12:08:35Z
New Revision: b28eeb28bea39148738dc375e8a97072a1907e64
URL: https://github.com/llvm/llvm-project/commit/b28eeb28bea39148738dc375e8a97072a1907e64
DIFF: https://github.com/llvm/llvm-project/commit/b28eeb28bea39148738dc375e8a97072a1907e64.diff
LOG: [CodeGen] Generalise Hexagon flags for memop inline thresholds (#172829)
Generalise the Hexagon cmdline options to control if memset, memcpy or memmove intrinsics should be inlined versus calling library functions, so they can be used by all backends:
• -max-store-memset
• -max-store-memcpy
• -max-store-memmove
These flags override the target-specific defaults set in TargetLowering (e.g., MaxStoresPerMemcpy) and allow fine-tuning of the inlining threshold for performance analysis and optimization.
The optsize variants (-max-store-memset-Os, -max-store-memcpy-Os, -max-store-memmove-Os) from the Hexagon backend were removed, and now the above options control both.
The threshold is specified as a number of store operations, which is backend-specific. Operations requiring more stores than the threshold will call the corresponding library function instead of being inlined.
Added:
llvm/test/CodeGen/AArch64/max-stores-per-mem-ops.ll
Modified:
llvm/include/llvm/CodeGen/TargetLowering.h
llvm/lib/CodeGen/TargetLoweringBase.cpp
llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
Removed:
################################################################################
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 8c01c58a0318f..080501bc5dff9 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1937,9 +1937,7 @@ class LLVM_ABI TargetLoweringBase {
/// to replace a call to llvm.memset. The value is set by the target at the
/// performance threshold for such a replacement. If OptSize is true,
/// return the limit for functions that have OptSize attribute.
- unsigned getMaxStoresPerMemset(bool OptSize) const {
- return OptSize ? MaxStoresPerMemsetOptSize : MaxStoresPerMemset;
- }
+ unsigned getMaxStoresPerMemset(bool OptSize) const;
/// Get maximum # of store operations permitted for llvm.memcpy
///
@@ -1947,9 +1945,7 @@ class LLVM_ABI TargetLoweringBase {
/// to replace a call to llvm.memcpy. The value is set by the target at the
/// performance threshold for such a replacement. If OptSize is true,
/// return the limit for functions that have OptSize attribute.
- unsigned getMaxStoresPerMemcpy(bool OptSize) const {
- return OptSize ? MaxStoresPerMemcpyOptSize : MaxStoresPerMemcpy;
- }
+ unsigned getMaxStoresPerMemcpy(bool OptSize) const;
/// \brief Get maximum # of store operations to be glued together
///
@@ -1976,9 +1972,7 @@ class LLVM_ABI TargetLoweringBase {
/// to replace a call to llvm.memmove. The value is set by the target at the
/// performance threshold for such a replacement. If OptSize is true,
/// return the limit for functions that have OptSize attribute.
- unsigned getMaxStoresPerMemmove(bool OptSize) const {
- return OptSize ? MaxStoresPerMemmoveOptSize : MaxStoresPerMemmove;
- }
+ unsigned getMaxStoresPerMemmove(bool OptSize) const;
/// Determine if the target supports unaligned memory accesses.
///
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index d7a8adc9b9d2b..c6aaea412a760 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -95,6 +95,24 @@ static cl::opt<unsigned> MinimumBitTestCmpsOverride(
cl::desc("Set minimum of largest number of comparisons "
"to use bit test for switch."));
+static cl::opt<unsigned> MaxStoresPerMemsetOverride(
+ "max-store-memset", cl::init(0), cl::Hidden,
+ cl::desc("Override target's MaxStoresPerMemset and "
+ "MaxStoresPerMemsetOptSize. "
+ "Set to 0 to use the target default."));
+
+static cl::opt<unsigned> MaxStoresPerMemcpyOverride(
+ "max-store-memcpy", cl::init(0), cl::Hidden,
+ cl::desc("Override target's MaxStoresPerMemcpy and "
+ "MaxStoresPerMemcpyOptSize. "
+ "Set to 0 to use the target default."));
+
+static cl::opt<unsigned> MaxStoresPerMemmoveOverride(
+ "max-store-memmove", cl::init(0), cl::Hidden,
+ cl::desc("Override target's MaxStoresPerMemmove and "
+ "MaxStoresPerMemmoveOptSize. "
+ "Set to 0 to use the target default."));
+
// FIXME: This option is only to test if the strict fp operation processed
// correctly by preventing mutating strict fp operation to normal fp operation
// during development. When the backend supports strict float operation, this
@@ -2116,6 +2134,27 @@ bool TargetLoweringBase::allowsMemoryAccess(LLVMContext &Context,
MMO.getFlags(), Fast);
}
+unsigned TargetLoweringBase::getMaxStoresPerMemset(bool OptSize) const {
+ if (MaxStoresPerMemsetOverride > 0)
+ return MaxStoresPerMemsetOverride;
+
+ return OptSize ? MaxStoresPerMemsetOptSize : MaxStoresPerMemset;
+}
+
+unsigned TargetLoweringBase::getMaxStoresPerMemcpy(bool OptSize) const {
+ if (MaxStoresPerMemcpyOverride > 0)
+ return MaxStoresPerMemcpyOverride;
+
+ return OptSize ? MaxStoresPerMemcpyOptSize : MaxStoresPerMemcpy;
+}
+
+unsigned TargetLoweringBase::getMaxStoresPerMemmove(bool OptSize) const {
+ if (MaxStoresPerMemmoveOverride > 0)
+ return MaxStoresPerMemmoveOverride;
+
+ return OptSize ? MaxStoresPerMemmoveOptSize : MaxStoresPerMemmove;
+}
+
//===----------------------------------------------------------------------===//
// TargetTransformInfo Helpers
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
index 153b40aac6083..e98d907350c2a 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -77,31 +77,6 @@ static cl::opt<int> MinimumJumpTables("minimum-jump-tables", cl::Hidden,
cl::init(5),
cl::desc("Set minimum jump tables"));
-static cl::opt<int>
- MaxStoresPerMemcpyCL("max-store-memcpy", cl::Hidden, cl::init(6),
- cl::desc("Max #stores to inline memcpy"));
-
-static cl::opt<int>
- MaxStoresPerMemcpyOptSizeCL("max-store-memcpy-Os", cl::Hidden, cl::init(4),
- cl::desc("Max #stores to inline memcpy"));
-
-static cl::opt<int>
- MaxStoresPerMemmoveCL("max-store-memmove", cl::Hidden, cl::init(6),
- cl::desc("Max #stores to inline memmove"));
-
-static cl::opt<int>
- MaxStoresPerMemmoveOptSizeCL("max-store-memmove-Os", cl::Hidden,
- cl::init(4),
- cl::desc("Max #stores to inline memmove"));
-
-static cl::opt<int>
- MaxStoresPerMemsetCL("max-store-memset", cl::Hidden, cl::init(8),
- cl::desc("Max #stores to inline memset"));
-
-static cl::opt<int>
- MaxStoresPerMemsetOptSizeCL("max-store-memset-Os", cl::Hidden, cl::init(4),
- cl::desc("Max #stores to inline memset"));
-
static cl::opt<bool>
ConstantLoadsToImm("constant-loads-to-imm", cl::Hidden, cl::init(true),
cl::desc("Convert constant loads to immediate values."));
@@ -1524,12 +1499,12 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
setSchedulingPreference(Sched::Source);
// Limits for inline expansion of memcpy/memmove
- MaxStoresPerMemcpy = MaxStoresPerMemcpyCL;
- MaxStoresPerMemcpyOptSize = MaxStoresPerMemcpyOptSizeCL;
- MaxStoresPerMemmove = MaxStoresPerMemmoveCL;
- MaxStoresPerMemmoveOptSize = MaxStoresPerMemmoveOptSizeCL;
- MaxStoresPerMemset = MaxStoresPerMemsetCL;
- MaxStoresPerMemsetOptSize = MaxStoresPerMemsetOptSizeCL;
+ MaxStoresPerMemcpy = 6;
+ MaxStoresPerMemcpyOptSize = 4;
+ MaxStoresPerMemmove = 6;
+ MaxStoresPerMemmoveOptSize = 4;
+ MaxStoresPerMemset = 8;
+ MaxStoresPerMemsetOptSize = 4;
//
// Set up register classes.
diff --git a/llvm/test/CodeGen/AArch64/max-stores-per-mem-ops.ll b/llvm/test/CodeGen/AArch64/max-stores-per-mem-ops.ll
new file mode 100644
index 0000000000000..260513a027a36
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/max-stores-per-mem-ops.ll
@@ -0,0 +1,501 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+
+; Test the -max-store-memset, -max-store-memcpy, and -max-store-memmove flags,
+; which control the threshold for when memset/memcpy/memmove are inlined vs
+; calling the library function.
+
+; RUN: llc %s -o - -mtriple=aarch64 | FileCheck %s --check-prefix=DEFAULT
+; RUN: llc %s -o - -mtriple=aarch64 -max-store-memset=7 | FileCheck %s --check-prefix=MEMSET-LIMIT-7
+; RUN: llc %s -o - -mtriple=aarch64 -max-store-memset=8 | FileCheck %s --check-prefix=MEMSET-LIMIT-8
+; RUN: llc %s -o - -mtriple=aarch64 -max-store-memcpy=7 | FileCheck %s --check-prefix=MEMCPY-LIMIT-7
+; RUN: llc %s -o - -mtriple=aarch64 -max-store-memcpy=8 | FileCheck %s --check-prefix=MEMCPY-LIMIT-8
+; RUN: llc %s -o - -mtriple=aarch64 -max-store-memmove=7 | FileCheck %s --check-prefix=MEMMOVE-LIMIT-7
+; RUN: llc %s -o - -mtriple=aarch64 -max-store-memmove=8 | FileCheck %s --check-prefix=MEMMOVE-LIMIT-8
+
+declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg)
+declare void @llvm.memcpy.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1)
+declare void @llvm.memmove.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1)
+
+; memset tests
+
+; Test memset with 128 bytes (the AArch64 backend counts stores in 16-byte units)
+; This should be inlined by default and with -max-store-memset=8
+define void @memset_128(ptr %dst) {
+; DEFAULT-LABEL: memset_128:
+; DEFAULT: // %bb.0:
+; DEFAULT-NEXT: movi v0.2d, #0000000000000000
+; DEFAULT-NEXT: stp q0, q0, [x0]
+; DEFAULT-NEXT: stp q0, q0, [x0, #32]
+; DEFAULT-NEXT: stp q0, q0, [x0, #64]
+; DEFAULT-NEXT: stp q0, q0, [x0, #96]
+; DEFAULT-NEXT: ret
+;
+; MEMSET-LIMIT-7-LABEL: memset_128:
+; MEMSET-LIMIT-7: // %bb.0:
+; MEMSET-LIMIT-7-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; MEMSET-LIMIT-7-NEXT: .cfi_def_cfa_offset 16
+; MEMSET-LIMIT-7-NEXT: .cfi_offset w30, -16
+; MEMSET-LIMIT-7-NEXT: mov w1, wzr
+; MEMSET-LIMIT-7-NEXT: mov w2, #128 // =0x80
+; MEMSET-LIMIT-7-NEXT: bl memset
+; MEMSET-LIMIT-7-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; MEMSET-LIMIT-7-NEXT: ret
+;
+; MEMSET-LIMIT-8-LABEL: memset_128:
+; MEMSET-LIMIT-8: // %bb.0:
+; MEMSET-LIMIT-8-NEXT: movi v0.2d, #0000000000000000
+; MEMSET-LIMIT-8-NEXT: stp q0, q0, [x0]
+; MEMSET-LIMIT-8-NEXT: stp q0, q0, [x0, #32]
+; MEMSET-LIMIT-8-NEXT: stp q0, q0, [x0, #64]
+; MEMSET-LIMIT-8-NEXT: stp q0, q0, [x0, #96]
+; MEMSET-LIMIT-8-NEXT: ret
+;
+; MEMCPY-LIMIT-7-LABEL: memset_128:
+; MEMCPY-LIMIT-7: // %bb.0:
+; MEMCPY-LIMIT-7-NEXT: movi v0.2d, #0000000000000000
+; MEMCPY-LIMIT-7-NEXT: stp q0, q0, [x0]
+; MEMCPY-LIMIT-7-NEXT: stp q0, q0, [x0, #32]
+; MEMCPY-LIMIT-7-NEXT: stp q0, q0, [x0, #64]
+; MEMCPY-LIMIT-7-NEXT: stp q0, q0, [x0, #96]
+; MEMCPY-LIMIT-7-NEXT: ret
+;
+; MEMCPY-LIMIT-8-LABEL: memset_128:
+; MEMCPY-LIMIT-8: // %bb.0:
+; MEMCPY-LIMIT-8-NEXT: movi v0.2d, #0000000000000000
+; MEMCPY-LIMIT-8-NEXT: stp q0, q0, [x0]
+; MEMCPY-LIMIT-8-NEXT: stp q0, q0, [x0, #32]
+; MEMCPY-LIMIT-8-NEXT: stp q0, q0, [x0, #64]
+; MEMCPY-LIMIT-8-NEXT: stp q0, q0, [x0, #96]
+; MEMCPY-LIMIT-8-NEXT: ret
+;
+; MEMMOVE-LIMIT-7-LABEL: memset_128:
+; MEMMOVE-LIMIT-7: // %bb.0:
+; MEMMOVE-LIMIT-7-NEXT: movi v0.2d, #0000000000000000
+; MEMMOVE-LIMIT-7-NEXT: stp q0, q0, [x0]
+; MEMMOVE-LIMIT-7-NEXT: stp q0, q0, [x0, #32]
+; MEMMOVE-LIMIT-7-NEXT: stp q0, q0, [x0, #64]
+; MEMMOVE-LIMIT-7-NEXT: stp q0, q0, [x0, #96]
+; MEMMOVE-LIMIT-7-NEXT: ret
+;
+; MEMMOVE-LIMIT-8-LABEL: memset_128:
+; MEMMOVE-LIMIT-8: // %bb.0:
+; MEMMOVE-LIMIT-8-NEXT: movi v0.2d, #0000000000000000
+; MEMMOVE-LIMIT-8-NEXT: stp q0, q0, [x0]
+; MEMMOVE-LIMIT-8-NEXT: stp q0, q0, [x0, #32]
+; MEMMOVE-LIMIT-8-NEXT: stp q0, q0, [x0, #64]
+; MEMMOVE-LIMIT-8-NEXT: stp q0, q0, [x0, #96]
+; MEMMOVE-LIMIT-8-NEXT: ret
+ call void @llvm.memset.p0.i64(ptr align 16 %dst, i8 0, i64 128, i1 false)
+ ret void
+}
+
+; Test memset in a function with optsize attribute
+; With -max-store-memset=8, 128 bytes (8 stores) should still inline
+define void @memset_128_optsize(ptr %dst) optsize {
+; DEFAULT-LABEL: memset_128_optsize:
+; DEFAULT: // %bb.0:
+; DEFAULT-NEXT: movi v0.2d, #0000000000000000
+; DEFAULT-NEXT: stp q0, q0, [x0]
+; DEFAULT-NEXT: stp q0, q0, [x0, #32]
+; DEFAULT-NEXT: stp q0, q0, [x0, #64]
+; DEFAULT-NEXT: stp q0, q0, [x0, #96]
+; DEFAULT-NEXT: ret
+;
+; MEMSET-LIMIT-7-LABEL: memset_128_optsize:
+; MEMSET-LIMIT-7: // %bb.0:
+; MEMSET-LIMIT-7-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; MEMSET-LIMIT-7-NEXT: .cfi_def_cfa_offset 16
+; MEMSET-LIMIT-7-NEXT: .cfi_offset w30, -16
+; MEMSET-LIMIT-7-NEXT: mov w1, wzr
+; MEMSET-LIMIT-7-NEXT: mov w2, #128 // =0x80
+; MEMSET-LIMIT-7-NEXT: bl memset
+; MEMSET-LIMIT-7-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; MEMSET-LIMIT-7-NEXT: ret
+;
+; MEMSET-LIMIT-8-LABEL: memset_128_optsize:
+; MEMSET-LIMIT-8: // %bb.0:
+; MEMSET-LIMIT-8-NEXT: movi v0.2d, #0000000000000000
+; MEMSET-LIMIT-8-NEXT: stp q0, q0, [x0]
+; MEMSET-LIMIT-8-NEXT: stp q0, q0, [x0, #32]
+; MEMSET-LIMIT-8-NEXT: stp q0, q0, [x0, #64]
+; MEMSET-LIMIT-8-NEXT: stp q0, q0, [x0, #96]
+; MEMSET-LIMIT-8-NEXT: ret
+;
+; MEMCPY-LIMIT-7-LABEL: memset_128_optsize:
+; MEMCPY-LIMIT-7: // %bb.0:
+; MEMCPY-LIMIT-7-NEXT: movi v0.2d, #0000000000000000
+; MEMCPY-LIMIT-7-NEXT: stp q0, q0, [x0]
+; MEMCPY-LIMIT-7-NEXT: stp q0, q0, [x0, #32]
+; MEMCPY-LIMIT-7-NEXT: stp q0, q0, [x0, #64]
+; MEMCPY-LIMIT-7-NEXT: stp q0, q0, [x0, #96]
+; MEMCPY-LIMIT-7-NEXT: ret
+;
+; MEMCPY-LIMIT-8-LABEL: memset_128_optsize:
+; MEMCPY-LIMIT-8: // %bb.0:
+; MEMCPY-LIMIT-8-NEXT: movi v0.2d, #0000000000000000
+; MEMCPY-LIMIT-8-NEXT: stp q0, q0, [x0]
+; MEMCPY-LIMIT-8-NEXT: stp q0, q0, [x0, #32]
+; MEMCPY-LIMIT-8-NEXT: stp q0, q0, [x0, #64]
+; MEMCPY-LIMIT-8-NEXT: stp q0, q0, [x0, #96]
+; MEMCPY-LIMIT-8-NEXT: ret
+;
+; MEMMOVE-LIMIT-7-LABEL: memset_128_optsize:
+; MEMMOVE-LIMIT-7: // %bb.0:
+; MEMMOVE-LIMIT-7-NEXT: movi v0.2d, #0000000000000000
+; MEMMOVE-LIMIT-7-NEXT: stp q0, q0, [x0]
+; MEMMOVE-LIMIT-7-NEXT: stp q0, q0, [x0, #32]
+; MEMMOVE-LIMIT-7-NEXT: stp q0, q0, [x0, #64]
+; MEMMOVE-LIMIT-7-NEXT: stp q0, q0, [x0, #96]
+; MEMMOVE-LIMIT-7-NEXT: ret
+;
+; MEMMOVE-LIMIT-8-LABEL: memset_128_optsize:
+; MEMMOVE-LIMIT-8: // %bb.0:
+; MEMMOVE-LIMIT-8-NEXT: movi v0.2d, #0000000000000000
+; MEMMOVE-LIMIT-8-NEXT: stp q0, q0, [x0]
+; MEMMOVE-LIMIT-8-NEXT: stp q0, q0, [x0, #32]
+; MEMMOVE-LIMIT-8-NEXT: stp q0, q0, [x0, #64]
+; MEMMOVE-LIMIT-8-NEXT: stp q0, q0, [x0, #96]
+; MEMMOVE-LIMIT-8-NEXT: ret
+ call void @llvm.memset.p0.i64(ptr align 16 %dst, i8 0, i64 128, i1 false)
+ ret void
+}
+
+; memcpy tests
+
+; Test memcpy with 128 bytes (8 stores of 16 bytes each)
+; This should be inlined by default and with -max-store-memcpy=8
+; but should call memcpy library with -max-store-memcpy=7
+define void @memcpy_128(ptr %dst, ptr %src) {
+; DEFAULT-LABEL: memcpy_128:
+; DEFAULT: // %bb.0:
+; DEFAULT-NEXT: ldp q1, q0, [x1, #32]
+; DEFAULT-NEXT: ldp q2, q3, [x1]
+; DEFAULT-NEXT: stp q1, q0, [x0, #32]
+; DEFAULT-NEXT: stp q2, q3, [x0]
+; DEFAULT-NEXT: ldp q1, q0, [x1, #96]
+; DEFAULT-NEXT: ldp q2, q3, [x1, #64]
+; DEFAULT-NEXT: stp q1, q0, [x0, #96]
+; DEFAULT-NEXT: stp q2, q3, [x0, #64]
+; DEFAULT-NEXT: ret
+;
+; MEMSET-LIMIT-7-LABEL: memcpy_128:
+; MEMSET-LIMIT-7: // %bb.0:
+; MEMSET-LIMIT-7-NEXT: ldp q1, q0, [x1, #32]
+; MEMSET-LIMIT-7-NEXT: ldp q2, q3, [x1]
+; MEMSET-LIMIT-7-NEXT: stp q1, q0, [x0, #32]
+; MEMSET-LIMIT-7-NEXT: stp q2, q3, [x0]
+; MEMSET-LIMIT-7-NEXT: ldp q1, q0, [x1, #96]
+; MEMSET-LIMIT-7-NEXT: ldp q2, q3, [x1, #64]
+; MEMSET-LIMIT-7-NEXT: stp q1, q0, [x0, #96]
+; MEMSET-LIMIT-7-NEXT: stp q2, q3, [x0, #64]
+; MEMSET-LIMIT-7-NEXT: ret
+;
+; MEMSET-LIMIT-8-LABEL: memcpy_128:
+; MEMSET-LIMIT-8: // %bb.0:
+; MEMSET-LIMIT-8-NEXT: ldp q1, q0, [x1, #32]
+; MEMSET-LIMIT-8-NEXT: ldp q2, q3, [x1]
+; MEMSET-LIMIT-8-NEXT: stp q1, q0, [x0, #32]
+; MEMSET-LIMIT-8-NEXT: stp q2, q3, [x0]
+; MEMSET-LIMIT-8-NEXT: ldp q1, q0, [x1, #96]
+; MEMSET-LIMIT-8-NEXT: ldp q2, q3, [x1, #64]
+; MEMSET-LIMIT-8-NEXT: stp q1, q0, [x0, #96]
+; MEMSET-LIMIT-8-NEXT: stp q2, q3, [x0, #64]
+; MEMSET-LIMIT-8-NEXT: ret
+;
+; MEMCPY-LIMIT-7-LABEL: memcpy_128:
+; MEMCPY-LIMIT-7: // %bb.0:
+; MEMCPY-LIMIT-7-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; MEMCPY-LIMIT-7-NEXT: .cfi_def_cfa_offset 16
+; MEMCPY-LIMIT-7-NEXT: .cfi_offset w30, -16
+; MEMCPY-LIMIT-7-NEXT: mov w2, #128 // =0x80
+; MEMCPY-LIMIT-7-NEXT: bl memcpy
+; MEMCPY-LIMIT-7-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; MEMCPY-LIMIT-7-NEXT: ret
+;
+; MEMCPY-LIMIT-8-LABEL: memcpy_128:
+; MEMCPY-LIMIT-8: // %bb.0:
+; MEMCPY-LIMIT-8-NEXT: ldp q1, q0, [x1, #32]
+; MEMCPY-LIMIT-8-NEXT: ldp q2, q3, [x1]
+; MEMCPY-LIMIT-8-NEXT: stp q1, q0, [x0, #32]
+; MEMCPY-LIMIT-8-NEXT: stp q2, q3, [x0]
+; MEMCPY-LIMIT-8-NEXT: ldp q1, q0, [x1, #96]
+; MEMCPY-LIMIT-8-NEXT: ldp q2, q3, [x1, #64]
+; MEMCPY-LIMIT-8-NEXT: stp q1, q0, [x0, #96]
+; MEMCPY-LIMIT-8-NEXT: stp q2, q3, [x0, #64]
+; MEMCPY-LIMIT-8-NEXT: ret
+;
+; MEMMOVE-LIMIT-7-LABEL: memcpy_128:
+; MEMMOVE-LIMIT-7: // %bb.0:
+; MEMMOVE-LIMIT-7-NEXT: ldp q1, q0, [x1, #32]
+; MEMMOVE-LIMIT-7-NEXT: ldp q2, q3, [x1]
+; MEMMOVE-LIMIT-7-NEXT: stp q1, q0, [x0, #32]
+; MEMMOVE-LIMIT-7-NEXT: stp q2, q3, [x0]
+; MEMMOVE-LIMIT-7-NEXT: ldp q1, q0, [x1, #96]
+; MEMMOVE-LIMIT-7-NEXT: ldp q2, q3, [x1, #64]
+; MEMMOVE-LIMIT-7-NEXT: stp q1, q0, [x0, #96]
+; MEMMOVE-LIMIT-7-NEXT: stp q2, q3, [x0, #64]
+; MEMMOVE-LIMIT-7-NEXT: ret
+;
+; MEMMOVE-LIMIT-8-LABEL: memcpy_128:
+; MEMMOVE-LIMIT-8: // %bb.0:
+; MEMMOVE-LIMIT-8-NEXT: ldp q1, q0, [x1, #32]
+; MEMMOVE-LIMIT-8-NEXT: ldp q2, q3, [x1]
+; MEMMOVE-LIMIT-8-NEXT: stp q1, q0, [x0, #32]
+; MEMMOVE-LIMIT-8-NEXT: stp q2, q3, [x0]
+; MEMMOVE-LIMIT-8-NEXT: ldp q1, q0, [x1, #96]
+; MEMMOVE-LIMIT-8-NEXT: ldp q2, q3, [x1, #64]
+; MEMMOVE-LIMIT-8-NEXT: stp q1, q0, [x0, #96]
+; MEMMOVE-LIMIT-8-NEXT: stp q2, q3, [x0, #64]
+; MEMMOVE-LIMIT-8-NEXT: ret
+ call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr %src, i64 128, i1 false)
+ ret void
+}
+
+; Test optsize memcpy with 128 bytes (8 stores of 16 bytes each)
+; This should be inlined with -max-store-memcpy=8
+; but should call memcpy library with -max-store-memcpy=7
+define void @memcpy_128_optsize(ptr %dst, ptr %src) optsize {
+; DEFAULT-LABEL: memcpy_128_optsize:
+; DEFAULT: // %bb.0:
+; DEFAULT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; DEFAULT-NEXT: .cfi_def_cfa_offset 16
+; DEFAULT-NEXT: .cfi_offset w30, -16
+; DEFAULT-NEXT: mov w2, #128 // =0x80
+; DEFAULT-NEXT: bl memcpy
+; DEFAULT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; DEFAULT-NEXT: ret
+;
+; MEMSET-LIMIT-7-LABEL: memcpy_128_optsize:
+; MEMSET-LIMIT-7: // %bb.0:
+; MEMSET-LIMIT-7-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; MEMSET-LIMIT-7-NEXT: .cfi_def_cfa_offset 16
+; MEMSET-LIMIT-7-NEXT: .cfi_offset w30, -16
+; MEMSET-LIMIT-7-NEXT: mov w2, #128 // =0x80
+; MEMSET-LIMIT-7-NEXT: bl memcpy
+; MEMSET-LIMIT-7-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; MEMSET-LIMIT-7-NEXT: ret
+;
+; MEMSET-LIMIT-8-LABEL: memcpy_128_optsize:
+; MEMSET-LIMIT-8: // %bb.0:
+; MEMSET-LIMIT-8-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; MEMSET-LIMIT-8-NEXT: .cfi_def_cfa_offset 16
+; MEMSET-LIMIT-8-NEXT: .cfi_offset w30, -16
+; MEMSET-LIMIT-8-NEXT: mov w2, #128 // =0x80
+; MEMSET-LIMIT-8-NEXT: bl memcpy
+; MEMSET-LIMIT-8-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; MEMSET-LIMIT-8-NEXT: ret
+;
+; MEMCPY-LIMIT-7-LABEL: memcpy_128_optsize:
+; MEMCPY-LIMIT-7: // %bb.0:
+; MEMCPY-LIMIT-7-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; MEMCPY-LIMIT-7-NEXT: .cfi_def_cfa_offset 16
+; MEMCPY-LIMIT-7-NEXT: .cfi_offset w30, -16
+; MEMCPY-LIMIT-7-NEXT: mov w2, #128 // =0x80
+; MEMCPY-LIMIT-7-NEXT: bl memcpy
+; MEMCPY-LIMIT-7-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; MEMCPY-LIMIT-7-NEXT: ret
+;
+; MEMCPY-LIMIT-8-LABEL: memcpy_128_optsize:
+; MEMCPY-LIMIT-8: // %bb.0:
+; MEMCPY-LIMIT-8-NEXT: ldp q1, q0, [x1, #32]
+; MEMCPY-LIMIT-8-NEXT: ldp q2, q3, [x1]
+; MEMCPY-LIMIT-8-NEXT: stp q1, q0, [x0, #32]
+; MEMCPY-LIMIT-8-NEXT: stp q2, q3, [x0]
+; MEMCPY-LIMIT-8-NEXT: ldp q1, q0, [x1, #96]
+; MEMCPY-LIMIT-8-NEXT: ldp q2, q3, [x1, #64]
+; MEMCPY-LIMIT-8-NEXT: stp q1, q0, [x0, #96]
+; MEMCPY-LIMIT-8-NEXT: stp q2, q3, [x0, #64]
+; MEMCPY-LIMIT-8-NEXT: ret
+;
+; MEMMOVE-LIMIT-7-LABEL: memcpy_128_optsize:
+; MEMMOVE-LIMIT-7: // %bb.0:
+; MEMMOVE-LIMIT-7-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; MEMMOVE-LIMIT-7-NEXT: .cfi_def_cfa_offset 16
+; MEMMOVE-LIMIT-7-NEXT: .cfi_offset w30, -16
+; MEMMOVE-LIMIT-7-NEXT: mov w2, #128 // =0x80
+; MEMMOVE-LIMIT-7-NEXT: bl memcpy
+; MEMMOVE-LIMIT-7-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; MEMMOVE-LIMIT-7-NEXT: ret
+;
+; MEMMOVE-LIMIT-8-LABEL: memcpy_128_optsize:
+; MEMMOVE-LIMIT-8: // %bb.0:
+; MEMMOVE-LIMIT-8-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; MEMMOVE-LIMIT-8-NEXT: .cfi_def_cfa_offset 16
+; MEMMOVE-LIMIT-8-NEXT: .cfi_offset w30, -16
+; MEMMOVE-LIMIT-8-NEXT: mov w2, #128 // =0x80
+; MEMMOVE-LIMIT-8-NEXT: bl memcpy
+; MEMMOVE-LIMIT-8-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; MEMMOVE-LIMIT-8-NEXT: ret
+ call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr %src, i64 128, i1 false)
+ ret void
+}
+
+; memmove tests
+
+; Test memmove with 128 bytes (8 stores of 16 bytes each)
+; This should be inlined by default and with -max-store-memmove=8
+; but should call memmove library with -max-store-memmove=7
+define void @memmove_128(ptr %dst, ptr %src) {
+; DEFAULT-LABEL: memmove_128:
+; DEFAULT: // %bb.0:
+; DEFAULT-NEXT: ldp q0, q1, [x1, #96]
+; DEFAULT-NEXT: ldp q2, q3, [x1]
+; DEFAULT-NEXT: ldp q4, q5, [x1, #32]
+; DEFAULT-NEXT: ldp q6, q7, [x1, #64]
+; DEFAULT-NEXT: stp q0, q1, [x0, #96]
+; DEFAULT-NEXT: stp q2, q3, [x0]
+; DEFAULT-NEXT: stp q4, q5, [x0, #32]
+; DEFAULT-NEXT: stp q6, q7, [x0, #64]
+; DEFAULT-NEXT: ret
+;
+; MEMSET-LIMIT-7-LABEL: memmove_128:
+; MEMSET-LIMIT-7: // %bb.0:
+; MEMSET-LIMIT-7-NEXT: ldp q0, q1, [x1, #96]
+; MEMSET-LIMIT-7-NEXT: ldp q2, q3, [x1]
+; MEMSET-LIMIT-7-NEXT: ldp q4, q5, [x1, #32]
+; MEMSET-LIMIT-7-NEXT: ldp q6, q7, [x1, #64]
+; MEMSET-LIMIT-7-NEXT: stp q0, q1, [x0, #96]
+; MEMSET-LIMIT-7-NEXT: stp q2, q3, [x0]
+; MEMSET-LIMIT-7-NEXT: stp q4, q5, [x0, #32]
+; MEMSET-LIMIT-7-NEXT: stp q6, q7, [x0, #64]
+; MEMSET-LIMIT-7-NEXT: ret
+;
+; MEMSET-LIMIT-8-LABEL: memmove_128:
+; MEMSET-LIMIT-8: // %bb.0:
+; MEMSET-LIMIT-8-NEXT: ldp q0, q1, [x1, #96]
+; MEMSET-LIMIT-8-NEXT: ldp q2, q3, [x1]
+; MEMSET-LIMIT-8-NEXT: ldp q4, q5, [x1, #32]
+; MEMSET-LIMIT-8-NEXT: ldp q6, q7, [x1, #64]
+; MEMSET-LIMIT-8-NEXT: stp q0, q1, [x0, #96]
+; MEMSET-LIMIT-8-NEXT: stp q2, q3, [x0]
+; MEMSET-LIMIT-8-NEXT: stp q4, q5, [x0, #32]
+; MEMSET-LIMIT-8-NEXT: stp q6, q7, [x0, #64]
+; MEMSET-LIMIT-8-NEXT: ret
+;
+; MEMCPY-LIMIT-7-LABEL: memmove_128:
+; MEMCPY-LIMIT-7: // %bb.0:
+; MEMCPY-LIMIT-7-NEXT: ldp q0, q1, [x1, #96]
+; MEMCPY-LIMIT-7-NEXT: ldp q2, q3, [x1]
+; MEMCPY-LIMIT-7-NEXT: ldp q4, q5, [x1, #32]
+; MEMCPY-LIMIT-7-NEXT: ldp q6, q7, [x1, #64]
+; MEMCPY-LIMIT-7-NEXT: stp q0, q1, [x0, #96]
+; MEMCPY-LIMIT-7-NEXT: stp q2, q3, [x0]
+; MEMCPY-LIMIT-7-NEXT: stp q4, q5, [x0, #32]
+; MEMCPY-LIMIT-7-NEXT: stp q6, q7, [x0, #64]
+; MEMCPY-LIMIT-7-NEXT: ret
+;
+; MEMCPY-LIMIT-8-LABEL: memmove_128:
+; MEMCPY-LIMIT-8: // %bb.0:
+; MEMCPY-LIMIT-8-NEXT: ldp q0, q1, [x1, #96]
+; MEMCPY-LIMIT-8-NEXT: ldp q2, q3, [x1]
+; MEMCPY-LIMIT-8-NEXT: ldp q4, q5, [x1, #32]
+; MEMCPY-LIMIT-8-NEXT: ldp q6, q7, [x1, #64]
+; MEMCPY-LIMIT-8-NEXT: stp q0, q1, [x0, #96]
+; MEMCPY-LIMIT-8-NEXT: stp q2, q3, [x0]
+; MEMCPY-LIMIT-8-NEXT: stp q4, q5, [x0, #32]
+; MEMCPY-LIMIT-8-NEXT: stp q6, q7, [x0, #64]
+; MEMCPY-LIMIT-8-NEXT: ret
+;
+; MEMMOVE-LIMIT-7-LABEL: memmove_128:
+; MEMMOVE-LIMIT-7: // %bb.0:
+; MEMMOVE-LIMIT-7-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; MEMMOVE-LIMIT-7-NEXT: .cfi_def_cfa_offset 16
+; MEMMOVE-LIMIT-7-NEXT: .cfi_offset w30, -16
+; MEMMOVE-LIMIT-7-NEXT: mov w2, #128 // =0x80
+; MEMMOVE-LIMIT-7-NEXT: bl memmove
+; MEMMOVE-LIMIT-7-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; MEMMOVE-LIMIT-7-NEXT: ret
+;
+; MEMMOVE-LIMIT-8-LABEL: memmove_128:
+; MEMMOVE-LIMIT-8: // %bb.0:
+; MEMMOVE-LIMIT-8-NEXT: ldp q0, q1, [x1, #96]
+; MEMMOVE-LIMIT-8-NEXT: ldp q2, q3, [x1]
+; MEMMOVE-LIMIT-8-NEXT: ldp q4, q5, [x1, #32]
+; MEMMOVE-LIMIT-8-NEXT: ldp q6, q7, [x1, #64]
+; MEMMOVE-LIMIT-8-NEXT: stp q0, q1, [x0, #96]
+; MEMMOVE-LIMIT-8-NEXT: stp q2, q3, [x0]
+; MEMMOVE-LIMIT-8-NEXT: stp q4, q5, [x0, #32]
+; MEMMOVE-LIMIT-8-NEXT: stp q6, q7, [x0, #64]
+; MEMMOVE-LIMIT-8-NEXT: ret
+ call void @llvm.memmove.p0.p0.i64(ptr %dst, ptr %src, i64 128, i1 false)
+ ret void
+}
+
+; Test optsize memmove with 128 bytes (8 stores of 16 bytes each)
+; This should be inlined with -max-store-memmove=8
+; but should call memmove library with -max-store-memmove=7
+define void @memmove_128_optsize(ptr %dst, ptr %src) optsize {
+; DEFAULT-LABEL: memmove_128_optsize:
+; DEFAULT: // %bb.0:
+; DEFAULT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; DEFAULT-NEXT: .cfi_def_cfa_offset 16
+; DEFAULT-NEXT: .cfi_offset w30, -16
+; DEFAULT-NEXT: mov w2, #128 // =0x80
+; DEFAULT-NEXT: bl memmove
+; DEFAULT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; DEFAULT-NEXT: ret
+;
+; MEMSET-LIMIT-7-LABEL: memmove_128_optsize:
+; MEMSET-LIMIT-7: // %bb.0:
+; MEMSET-LIMIT-7-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; MEMSET-LIMIT-7-NEXT: .cfi_def_cfa_offset 16
+; MEMSET-LIMIT-7-NEXT: .cfi_offset w30, -16
+; MEMSET-LIMIT-7-NEXT: mov w2, #128 // =0x80
+; MEMSET-LIMIT-7-NEXT: bl memmove
+; MEMSET-LIMIT-7-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; MEMSET-LIMIT-7-NEXT: ret
+;
+; MEMSET-LIMIT-8-LABEL: memmove_128_optsize:
+; MEMSET-LIMIT-8: // %bb.0:
+; MEMSET-LIMIT-8-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; MEMSET-LIMIT-8-NEXT: .cfi_def_cfa_offset 16
+; MEMSET-LIMIT-8-NEXT: .cfi_offset w30, -16
+; MEMSET-LIMIT-8-NEXT: mov w2, #128 // =0x80
+; MEMSET-LIMIT-8-NEXT: bl memmove
+; MEMSET-LIMIT-8-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; MEMSET-LIMIT-8-NEXT: ret
+;
+; MEMCPY-LIMIT-7-LABEL: memmove_128_optsize:
+; MEMCPY-LIMIT-7: // %bb.0:
+; MEMCPY-LIMIT-7-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; MEMCPY-LIMIT-7-NEXT: .cfi_def_cfa_offset 16
+; MEMCPY-LIMIT-7-NEXT: .cfi_offset w30, -16
+; MEMCPY-LIMIT-7-NEXT: mov w2, #128 // =0x80
+; MEMCPY-LIMIT-7-NEXT: bl memmove
+; MEMCPY-LIMIT-7-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; MEMCPY-LIMIT-7-NEXT: ret
+;
+; MEMCPY-LIMIT-8-LABEL: memmove_128_optsize:
+; MEMCPY-LIMIT-8: // %bb.0:
+; MEMCPY-LIMIT-8-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; MEMCPY-LIMIT-8-NEXT: .cfi_def_cfa_offset 16
+; MEMCPY-LIMIT-8-NEXT: .cfi_offset w30, -16
+; MEMCPY-LIMIT-8-NEXT: mov w2, #128 // =0x80
+; MEMCPY-LIMIT-8-NEXT: bl memmove
+; MEMCPY-LIMIT-8-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; MEMCPY-LIMIT-8-NEXT: ret
+;
+; MEMMOVE-LIMIT-7-LABEL: memmove_128_optsize:
+; MEMMOVE-LIMIT-7: // %bb.0:
+; MEMMOVE-LIMIT-7-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; MEMMOVE-LIMIT-7-NEXT: .cfi_def_cfa_offset 16
+; MEMMOVE-LIMIT-7-NEXT: .cfi_offset w30, -16
+; MEMMOVE-LIMIT-7-NEXT: mov w2, #128 // =0x80
+; MEMMOVE-LIMIT-7-NEXT: bl memmove
+; MEMMOVE-LIMIT-7-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; MEMMOVE-LIMIT-7-NEXT: ret
+;
+; MEMMOVE-LIMIT-8-LABEL: memmove_128_optsize:
+; MEMMOVE-LIMIT-8: // %bb.0:
+; MEMMOVE-LIMIT-8-NEXT: ldp q0, q1, [x1, #96]
+; MEMMOVE-LIMIT-8-NEXT: ldp q2, q3, [x1]
+; MEMMOVE-LIMIT-8-NEXT: ldp q4, q5, [x1, #32]
+; MEMMOVE-LIMIT-8-NEXT: ldp q6, q7, [x1, #64]
+; MEMMOVE-LIMIT-8-NEXT: stp q0, q1, [x0, #96]
+; MEMMOVE-LIMIT-8-NEXT: stp q2, q3, [x0]
+; MEMMOVE-LIMIT-8-NEXT: stp q4, q5, [x0, #32]
+; MEMMOVE-LIMIT-8-NEXT: stp q6, q7, [x0, #64]
+; MEMMOVE-LIMIT-8-NEXT: ret
+ call void @llvm.memmove.p0.p0.i64(ptr %dst, ptr %src, i64 128, i1 false)
+ ret void
+}
More information about the llvm-commits
mailing list