[llvm] b28eeb2 - [CodeGen] Generalise Hexagon flags for memop inline thresholds (#172829)

via llvm-commits llvm-commits at lists.llvm.org
Fri Jan 9 04:08:40 PST 2026


Author: Ties Stuij
Date: 2026-01-09T12:08:35Z
New Revision: b28eeb28bea39148738dc375e8a97072a1907e64

URL: https://github.com/llvm/llvm-project/commit/b28eeb28bea39148738dc375e8a97072a1907e64
DIFF: https://github.com/llvm/llvm-project/commit/b28eeb28bea39148738dc375e8a97072a1907e64.diff

LOG: [CodeGen] Generalise Hexagon flags for memop inline thresholds (#172829)

Generalise the Hexagon command-line options that control whether memset, memcpy or memmove intrinsics are inlined or lowered to library calls, so that they can be used by all backends:

	•	-max-store-memset
	•	-max-store-memcpy
	•	-max-store-memmove

These flags override the target-specific defaults set in TargetLowering (e.g., MaxStoresPerMemcpy) and allow fine-tuning of the inlining threshold for performance analysis and optimization.

The optsize variants from the Hexagon backend (-max-store-memset-Os, -max-store-memcpy-Os, -max-store-memmove-Os) were removed; each of the options above now overrides both the default and the optsize threshold.

The threshold is specified as a number of store operations, which is backend-specific. Operations requiring more stores than the threshold will call the corresponding library function instead of being inlined.

Added: 
    llvm/test/CodeGen/AArch64/max-stores-per-mem-ops.ll

Modified: 
    llvm/include/llvm/CodeGen/TargetLowering.h
    llvm/lib/CodeGen/TargetLoweringBase.cpp
    llvm/lib/Target/Hexagon/HexagonISelLowering.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 8c01c58a0318f..080501bc5dff9 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1937,9 +1937,7 @@ class LLVM_ABI TargetLoweringBase {
   /// to replace a call to llvm.memset. The value is set by the target at the
   /// performance threshold for such a replacement. If OptSize is true,
   /// return the limit for functions that have OptSize attribute.
-  unsigned getMaxStoresPerMemset(bool OptSize) const {
-    return OptSize ? MaxStoresPerMemsetOptSize : MaxStoresPerMemset;
-  }
+  unsigned getMaxStoresPerMemset(bool OptSize) const;
 
   /// Get maximum # of store operations permitted for llvm.memcpy
   ///
@@ -1947,9 +1945,7 @@ class LLVM_ABI TargetLoweringBase {
   /// to replace a call to llvm.memcpy. The value is set by the target at the
   /// performance threshold for such a replacement. If OptSize is true,
   /// return the limit for functions that have OptSize attribute.
-  unsigned getMaxStoresPerMemcpy(bool OptSize) const {
-    return OptSize ? MaxStoresPerMemcpyOptSize : MaxStoresPerMemcpy;
-  }
+  unsigned getMaxStoresPerMemcpy(bool OptSize) const;
 
   /// \brief Get maximum # of store operations to be glued together
   ///
@@ -1976,9 +1972,7 @@ class LLVM_ABI TargetLoweringBase {
   /// to replace a call to llvm.memmove. The value is set by the target at the
   /// performance threshold for such a replacement. If OptSize is true,
   /// return the limit for functions that have OptSize attribute.
-  unsigned getMaxStoresPerMemmove(bool OptSize) const {
-    return OptSize ? MaxStoresPerMemmoveOptSize : MaxStoresPerMemmove;
-  }
+  unsigned getMaxStoresPerMemmove(bool OptSize) const;
 
   /// Determine if the target supports unaligned memory accesses.
   ///

diff  --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index d7a8adc9b9d2b..c6aaea412a760 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -95,6 +95,24 @@ static cl::opt<unsigned> MinimumBitTestCmpsOverride(
     cl::desc("Set minimum of largest number of comparisons "
              "to use bit test for switch."));
 
+static cl::opt<unsigned> MaxStoresPerMemsetOverride(
+    "max-store-memset", cl::init(0), cl::Hidden,
+    cl::desc("Override target's MaxStoresPerMemset and "
+             "MaxStoresPerMemsetOptSize. "
+             "Set to 0 to use the target default."));
+
+static cl::opt<unsigned> MaxStoresPerMemcpyOverride(
+    "max-store-memcpy", cl::init(0), cl::Hidden,
+    cl::desc("Override target's MaxStoresPerMemcpy and "
+             "MaxStoresPerMemcpyOptSize. "
+             "Set to 0 to use the target default."));
+
+static cl::opt<unsigned> MaxStoresPerMemmoveOverride(
+    "max-store-memmove", cl::init(0), cl::Hidden,
+    cl::desc("Override target's MaxStoresPerMemmove and "
+             "MaxStoresPerMemmoveOptSize. "
+             "Set to 0 to use the target default."));
+
 // FIXME: This option is only to test if the strict fp operation processed
 // correctly by preventing mutating strict fp operation to normal fp operation
 // during development. When the backend supports strict float operation, this
@@ -2116,6 +2134,27 @@ bool TargetLoweringBase::allowsMemoryAccess(LLVMContext &Context,
                             MMO.getFlags(), Fast);
 }
 
+unsigned TargetLoweringBase::getMaxStoresPerMemset(bool OptSize) const {
+  if (MaxStoresPerMemsetOverride > 0)
+    return MaxStoresPerMemsetOverride;
+
+  return OptSize ? MaxStoresPerMemsetOptSize : MaxStoresPerMemset;
+}
+
+unsigned TargetLoweringBase::getMaxStoresPerMemcpy(bool OptSize) const {
+  if (MaxStoresPerMemcpyOverride > 0)
+    return MaxStoresPerMemcpyOverride;
+
+  return OptSize ? MaxStoresPerMemcpyOptSize : MaxStoresPerMemcpy;
+}
+
+unsigned TargetLoweringBase::getMaxStoresPerMemmove(bool OptSize) const {
+  if (MaxStoresPerMemmoveOverride > 0)
+    return MaxStoresPerMemmoveOverride;
+
+  return OptSize ? MaxStoresPerMemmoveOptSize : MaxStoresPerMemmove;
+}
+
 //===----------------------------------------------------------------------===//
 //  TargetTransformInfo Helpers
 //===----------------------------------------------------------------------===//

diff  --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
index 153b40aac6083..e98d907350c2a 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -77,31 +77,6 @@ static cl::opt<int> MinimumJumpTables("minimum-jump-tables", cl::Hidden,
                                       cl::init(5),
                                       cl::desc("Set minimum jump tables"));
 
-static cl::opt<int>
-    MaxStoresPerMemcpyCL("max-store-memcpy", cl::Hidden, cl::init(6),
-                         cl::desc("Max #stores to inline memcpy"));
-
-static cl::opt<int>
-    MaxStoresPerMemcpyOptSizeCL("max-store-memcpy-Os", cl::Hidden, cl::init(4),
-                                cl::desc("Max #stores to inline memcpy"));
-
-static cl::opt<int>
-    MaxStoresPerMemmoveCL("max-store-memmove", cl::Hidden, cl::init(6),
-                          cl::desc("Max #stores to inline memmove"));
-
-static cl::opt<int>
-    MaxStoresPerMemmoveOptSizeCL("max-store-memmove-Os", cl::Hidden,
-                                 cl::init(4),
-                                 cl::desc("Max #stores to inline memmove"));
-
-static cl::opt<int>
-    MaxStoresPerMemsetCL("max-store-memset", cl::Hidden, cl::init(8),
-                         cl::desc("Max #stores to inline memset"));
-
-static cl::opt<int>
-    MaxStoresPerMemsetOptSizeCL("max-store-memset-Os", cl::Hidden, cl::init(4),
-                                cl::desc("Max #stores to inline memset"));
-
 static cl::opt<bool>
     ConstantLoadsToImm("constant-loads-to-imm", cl::Hidden, cl::init(true),
                        cl::desc("Convert constant loads to immediate values."));
@@ -1524,12 +1499,12 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
     setSchedulingPreference(Sched::Source);
 
   // Limits for inline expansion of memcpy/memmove
-  MaxStoresPerMemcpy = MaxStoresPerMemcpyCL;
-  MaxStoresPerMemcpyOptSize = MaxStoresPerMemcpyOptSizeCL;
-  MaxStoresPerMemmove = MaxStoresPerMemmoveCL;
-  MaxStoresPerMemmoveOptSize = MaxStoresPerMemmoveOptSizeCL;
-  MaxStoresPerMemset = MaxStoresPerMemsetCL;
-  MaxStoresPerMemsetOptSize = MaxStoresPerMemsetOptSizeCL;
+  MaxStoresPerMemcpy = 6;
+  MaxStoresPerMemcpyOptSize = 4;
+  MaxStoresPerMemmove = 6;
+  MaxStoresPerMemmoveOptSize = 4;
+  MaxStoresPerMemset = 8;
+  MaxStoresPerMemsetOptSize = 4;
 
   //
   // Set up register classes.

diff  --git a/llvm/test/CodeGen/AArch64/max-stores-per-mem-ops.ll b/llvm/test/CodeGen/AArch64/max-stores-per-mem-ops.ll
new file mode 100644
index 0000000000000..260513a027a36
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/max-stores-per-mem-ops.ll
@@ -0,0 +1,501 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+
+; Test the -max-store-memset, -max-store-memcpy, and -max-store-memmove flags,
+; which control the threshold for when memset/memcpy/memmove are inlined vs
+; calling the library function.
+
+; RUN: llc %s -o - -mtriple=aarch64 | FileCheck %s --check-prefix=DEFAULT
+; RUN: llc %s -o - -mtriple=aarch64 -max-store-memset=7 | FileCheck %s --check-prefix=MEMSET-LIMIT-7
+; RUN: llc %s -o - -mtriple=aarch64 -max-store-memset=8 | FileCheck %s --check-prefix=MEMSET-LIMIT-8
+; RUN: llc %s -o - -mtriple=aarch64 -max-store-memcpy=7 | FileCheck %s --check-prefix=MEMCPY-LIMIT-7
+; RUN: llc %s -o - -mtriple=aarch64 -max-store-memcpy=8 | FileCheck %s --check-prefix=MEMCPY-LIMIT-8
+; RUN: llc %s -o - -mtriple=aarch64 -max-store-memmove=7 | FileCheck %s --check-prefix=MEMMOVE-LIMIT-7
+; RUN: llc %s -o - -mtriple=aarch64 -max-store-memmove=8 | FileCheck %s --check-prefix=MEMMOVE-LIMIT-8
+
+declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg)
+declare void @llvm.memcpy.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1)
+declare void @llvm.memmove.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1)
+
+; memset tests
+
+; Test memset with 128 bytes (the AArch64 backend counts stores in 16-byte units)
+; This should be inlined by default and with -max-store-memset=8
+define void @memset_128(ptr %dst) {
+; DEFAULT-LABEL: memset_128:
+; DEFAULT:       // %bb.0:
+; DEFAULT-NEXT:    movi v0.2d, #0000000000000000
+; DEFAULT-NEXT:    stp q0, q0, [x0]
+; DEFAULT-NEXT:    stp q0, q0, [x0, #32]
+; DEFAULT-NEXT:    stp q0, q0, [x0, #64]
+; DEFAULT-NEXT:    stp q0, q0, [x0, #96]
+; DEFAULT-NEXT:    ret
+;
+; MEMSET-LIMIT-7-LABEL: memset_128:
+; MEMSET-LIMIT-7:       // %bb.0:
+; MEMSET-LIMIT-7-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; MEMSET-LIMIT-7-NEXT:    .cfi_def_cfa_offset 16
+; MEMSET-LIMIT-7-NEXT:    .cfi_offset w30, -16
+; MEMSET-LIMIT-7-NEXT:    mov w1, wzr
+; MEMSET-LIMIT-7-NEXT:    mov w2, #128 // =0x80
+; MEMSET-LIMIT-7-NEXT:    bl memset
+; MEMSET-LIMIT-7-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; MEMSET-LIMIT-7-NEXT:    ret
+;
+; MEMSET-LIMIT-8-LABEL: memset_128:
+; MEMSET-LIMIT-8:       // %bb.0:
+; MEMSET-LIMIT-8-NEXT:    movi v0.2d, #0000000000000000
+; MEMSET-LIMIT-8-NEXT:    stp q0, q0, [x0]
+; MEMSET-LIMIT-8-NEXT:    stp q0, q0, [x0, #32]
+; MEMSET-LIMIT-8-NEXT:    stp q0, q0, [x0, #64]
+; MEMSET-LIMIT-8-NEXT:    stp q0, q0, [x0, #96]
+; MEMSET-LIMIT-8-NEXT:    ret
+;
+; MEMCPY-LIMIT-7-LABEL: memset_128:
+; MEMCPY-LIMIT-7:       // %bb.0:
+; MEMCPY-LIMIT-7-NEXT:    movi v0.2d, #0000000000000000
+; MEMCPY-LIMIT-7-NEXT:    stp q0, q0, [x0]
+; MEMCPY-LIMIT-7-NEXT:    stp q0, q0, [x0, #32]
+; MEMCPY-LIMIT-7-NEXT:    stp q0, q0, [x0, #64]
+; MEMCPY-LIMIT-7-NEXT:    stp q0, q0, [x0, #96]
+; MEMCPY-LIMIT-7-NEXT:    ret
+;
+; MEMCPY-LIMIT-8-LABEL: memset_128:
+; MEMCPY-LIMIT-8:       // %bb.0:
+; MEMCPY-LIMIT-8-NEXT:    movi v0.2d, #0000000000000000
+; MEMCPY-LIMIT-8-NEXT:    stp q0, q0, [x0]
+; MEMCPY-LIMIT-8-NEXT:    stp q0, q0, [x0, #32]
+; MEMCPY-LIMIT-8-NEXT:    stp q0, q0, [x0, #64]
+; MEMCPY-LIMIT-8-NEXT:    stp q0, q0, [x0, #96]
+; MEMCPY-LIMIT-8-NEXT:    ret
+;
+; MEMMOVE-LIMIT-7-LABEL: memset_128:
+; MEMMOVE-LIMIT-7:       // %bb.0:
+; MEMMOVE-LIMIT-7-NEXT:    movi v0.2d, #0000000000000000
+; MEMMOVE-LIMIT-7-NEXT:    stp q0, q0, [x0]
+; MEMMOVE-LIMIT-7-NEXT:    stp q0, q0, [x0, #32]
+; MEMMOVE-LIMIT-7-NEXT:    stp q0, q0, [x0, #64]
+; MEMMOVE-LIMIT-7-NEXT:    stp q0, q0, [x0, #96]
+; MEMMOVE-LIMIT-7-NEXT:    ret
+;
+; MEMMOVE-LIMIT-8-LABEL: memset_128:
+; MEMMOVE-LIMIT-8:       // %bb.0:
+; MEMMOVE-LIMIT-8-NEXT:    movi v0.2d, #0000000000000000
+; MEMMOVE-LIMIT-8-NEXT:    stp q0, q0, [x0]
+; MEMMOVE-LIMIT-8-NEXT:    stp q0, q0, [x0, #32]
+; MEMMOVE-LIMIT-8-NEXT:    stp q0, q0, [x0, #64]
+; MEMMOVE-LIMIT-8-NEXT:    stp q0, q0, [x0, #96]
+; MEMMOVE-LIMIT-8-NEXT:    ret
+  call void @llvm.memset.p0.i64(ptr align 16 %dst, i8 0, i64 128, i1 false)
+  ret void
+}
+
+; Test memset in a function with optsize attribute
+; With -max-store-memset=8, 128 bytes (8 stores) should still inline
+define void @memset_128_optsize(ptr %dst) optsize {
+; DEFAULT-LABEL: memset_128_optsize:
+; DEFAULT:       // %bb.0:
+; DEFAULT-NEXT:    movi v0.2d, #0000000000000000
+; DEFAULT-NEXT:    stp q0, q0, [x0]
+; DEFAULT-NEXT:    stp q0, q0, [x0, #32]
+; DEFAULT-NEXT:    stp q0, q0, [x0, #64]
+; DEFAULT-NEXT:    stp q0, q0, [x0, #96]
+; DEFAULT-NEXT:    ret
+;
+; MEMSET-LIMIT-7-LABEL: memset_128_optsize:
+; MEMSET-LIMIT-7:       // %bb.0:
+; MEMSET-LIMIT-7-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; MEMSET-LIMIT-7-NEXT:    .cfi_def_cfa_offset 16
+; MEMSET-LIMIT-7-NEXT:    .cfi_offset w30, -16
+; MEMSET-LIMIT-7-NEXT:    mov w1, wzr
+; MEMSET-LIMIT-7-NEXT:    mov w2, #128 // =0x80
+; MEMSET-LIMIT-7-NEXT:    bl memset
+; MEMSET-LIMIT-7-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; MEMSET-LIMIT-7-NEXT:    ret
+;
+; MEMSET-LIMIT-8-LABEL: memset_128_optsize:
+; MEMSET-LIMIT-8:       // %bb.0:
+; MEMSET-LIMIT-8-NEXT:    movi v0.2d, #0000000000000000
+; MEMSET-LIMIT-8-NEXT:    stp q0, q0, [x0]
+; MEMSET-LIMIT-8-NEXT:    stp q0, q0, [x0, #32]
+; MEMSET-LIMIT-8-NEXT:    stp q0, q0, [x0, #64]
+; MEMSET-LIMIT-8-NEXT:    stp q0, q0, [x0, #96]
+; MEMSET-LIMIT-8-NEXT:    ret
+;
+; MEMCPY-LIMIT-7-LABEL: memset_128_optsize:
+; MEMCPY-LIMIT-7:       // %bb.0:
+; MEMCPY-LIMIT-7-NEXT:    movi v0.2d, #0000000000000000
+; MEMCPY-LIMIT-7-NEXT:    stp q0, q0, [x0]
+; MEMCPY-LIMIT-7-NEXT:    stp q0, q0, [x0, #32]
+; MEMCPY-LIMIT-7-NEXT:    stp q0, q0, [x0, #64]
+; MEMCPY-LIMIT-7-NEXT:    stp q0, q0, [x0, #96]
+; MEMCPY-LIMIT-7-NEXT:    ret
+;
+; MEMCPY-LIMIT-8-LABEL: memset_128_optsize:
+; MEMCPY-LIMIT-8:       // %bb.0:
+; MEMCPY-LIMIT-8-NEXT:    movi v0.2d, #0000000000000000
+; MEMCPY-LIMIT-8-NEXT:    stp q0, q0, [x0]
+; MEMCPY-LIMIT-8-NEXT:    stp q0, q0, [x0, #32]
+; MEMCPY-LIMIT-8-NEXT:    stp q0, q0, [x0, #64]
+; MEMCPY-LIMIT-8-NEXT:    stp q0, q0, [x0, #96]
+; MEMCPY-LIMIT-8-NEXT:    ret
+;
+; MEMMOVE-LIMIT-7-LABEL: memset_128_optsize:
+; MEMMOVE-LIMIT-7:       // %bb.0:
+; MEMMOVE-LIMIT-7-NEXT:    movi v0.2d, #0000000000000000
+; MEMMOVE-LIMIT-7-NEXT:    stp q0, q0, [x0]
+; MEMMOVE-LIMIT-7-NEXT:    stp q0, q0, [x0, #32]
+; MEMMOVE-LIMIT-7-NEXT:    stp q0, q0, [x0, #64]
+; MEMMOVE-LIMIT-7-NEXT:    stp q0, q0, [x0, #96]
+; MEMMOVE-LIMIT-7-NEXT:    ret
+;
+; MEMMOVE-LIMIT-8-LABEL: memset_128_optsize:
+; MEMMOVE-LIMIT-8:       // %bb.0:
+; MEMMOVE-LIMIT-8-NEXT:    movi v0.2d, #0000000000000000
+; MEMMOVE-LIMIT-8-NEXT:    stp q0, q0, [x0]
+; MEMMOVE-LIMIT-8-NEXT:    stp q0, q0, [x0, #32]
+; MEMMOVE-LIMIT-8-NEXT:    stp q0, q0, [x0, #64]
+; MEMMOVE-LIMIT-8-NEXT:    stp q0, q0, [x0, #96]
+; MEMMOVE-LIMIT-8-NEXT:    ret
+  call void @llvm.memset.p0.i64(ptr align 16 %dst, i8 0, i64 128, i1 false)
+  ret void
+}
+
+; memcpy tests
+
+; Test memcpy with 128 bytes (8 stores of 16 bytes each)
+; This should be inlined by default and with -max-store-memcpy=8
+; but should call memcpy library with -max-store-memcpy=7
+define void @memcpy_128(ptr %dst, ptr %src) {
+; DEFAULT-LABEL: memcpy_128:
+; DEFAULT:       // %bb.0:
+; DEFAULT-NEXT:    ldp q1, q0, [x1, #32]
+; DEFAULT-NEXT:    ldp q2, q3, [x1]
+; DEFAULT-NEXT:    stp q1, q0, [x0, #32]
+; DEFAULT-NEXT:    stp q2, q3, [x0]
+; DEFAULT-NEXT:    ldp q1, q0, [x1, #96]
+; DEFAULT-NEXT:    ldp q2, q3, [x1, #64]
+; DEFAULT-NEXT:    stp q1, q0, [x0, #96]
+; DEFAULT-NEXT:    stp q2, q3, [x0, #64]
+; DEFAULT-NEXT:    ret
+;
+; MEMSET-LIMIT-7-LABEL: memcpy_128:
+; MEMSET-LIMIT-7:       // %bb.0:
+; MEMSET-LIMIT-7-NEXT:    ldp q1, q0, [x1, #32]
+; MEMSET-LIMIT-7-NEXT:    ldp q2, q3, [x1]
+; MEMSET-LIMIT-7-NEXT:    stp q1, q0, [x0, #32]
+; MEMSET-LIMIT-7-NEXT:    stp q2, q3, [x0]
+; MEMSET-LIMIT-7-NEXT:    ldp q1, q0, [x1, #96]
+; MEMSET-LIMIT-7-NEXT:    ldp q2, q3, [x1, #64]
+; MEMSET-LIMIT-7-NEXT:    stp q1, q0, [x0, #96]
+; MEMSET-LIMIT-7-NEXT:    stp q2, q3, [x0, #64]
+; MEMSET-LIMIT-7-NEXT:    ret
+;
+; MEMSET-LIMIT-8-LABEL: memcpy_128:
+; MEMSET-LIMIT-8:       // %bb.0:
+; MEMSET-LIMIT-8-NEXT:    ldp q1, q0, [x1, #32]
+; MEMSET-LIMIT-8-NEXT:    ldp q2, q3, [x1]
+; MEMSET-LIMIT-8-NEXT:    stp q1, q0, [x0, #32]
+; MEMSET-LIMIT-8-NEXT:    stp q2, q3, [x0]
+; MEMSET-LIMIT-8-NEXT:    ldp q1, q0, [x1, #96]
+; MEMSET-LIMIT-8-NEXT:    ldp q2, q3, [x1, #64]
+; MEMSET-LIMIT-8-NEXT:    stp q1, q0, [x0, #96]
+; MEMSET-LIMIT-8-NEXT:    stp q2, q3, [x0, #64]
+; MEMSET-LIMIT-8-NEXT:    ret
+;
+; MEMCPY-LIMIT-7-LABEL: memcpy_128:
+; MEMCPY-LIMIT-7:       // %bb.0:
+; MEMCPY-LIMIT-7-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; MEMCPY-LIMIT-7-NEXT:    .cfi_def_cfa_offset 16
+; MEMCPY-LIMIT-7-NEXT:    .cfi_offset w30, -16
+; MEMCPY-LIMIT-7-NEXT:    mov w2, #128 // =0x80
+; MEMCPY-LIMIT-7-NEXT:    bl memcpy
+; MEMCPY-LIMIT-7-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; MEMCPY-LIMIT-7-NEXT:    ret
+;
+; MEMCPY-LIMIT-8-LABEL: memcpy_128:
+; MEMCPY-LIMIT-8:       // %bb.0:
+; MEMCPY-LIMIT-8-NEXT:    ldp q1, q0, [x1, #32]
+; MEMCPY-LIMIT-8-NEXT:    ldp q2, q3, [x1]
+; MEMCPY-LIMIT-8-NEXT:    stp q1, q0, [x0, #32]
+; MEMCPY-LIMIT-8-NEXT:    stp q2, q3, [x0]
+; MEMCPY-LIMIT-8-NEXT:    ldp q1, q0, [x1, #96]
+; MEMCPY-LIMIT-8-NEXT:    ldp q2, q3, [x1, #64]
+; MEMCPY-LIMIT-8-NEXT:    stp q1, q0, [x0, #96]
+; MEMCPY-LIMIT-8-NEXT:    stp q2, q3, [x0, #64]
+; MEMCPY-LIMIT-8-NEXT:    ret
+;
+; MEMMOVE-LIMIT-7-LABEL: memcpy_128:
+; MEMMOVE-LIMIT-7:       // %bb.0:
+; MEMMOVE-LIMIT-7-NEXT:    ldp q1, q0, [x1, #32]
+; MEMMOVE-LIMIT-7-NEXT:    ldp q2, q3, [x1]
+; MEMMOVE-LIMIT-7-NEXT:    stp q1, q0, [x0, #32]
+; MEMMOVE-LIMIT-7-NEXT:    stp q2, q3, [x0]
+; MEMMOVE-LIMIT-7-NEXT:    ldp q1, q0, [x1, #96]
+; MEMMOVE-LIMIT-7-NEXT:    ldp q2, q3, [x1, #64]
+; MEMMOVE-LIMIT-7-NEXT:    stp q1, q0, [x0, #96]
+; MEMMOVE-LIMIT-7-NEXT:    stp q2, q3, [x0, #64]
+; MEMMOVE-LIMIT-7-NEXT:    ret
+;
+; MEMMOVE-LIMIT-8-LABEL: memcpy_128:
+; MEMMOVE-LIMIT-8:       // %bb.0:
+; MEMMOVE-LIMIT-8-NEXT:    ldp q1, q0, [x1, #32]
+; MEMMOVE-LIMIT-8-NEXT:    ldp q2, q3, [x1]
+; MEMMOVE-LIMIT-8-NEXT:    stp q1, q0, [x0, #32]
+; MEMMOVE-LIMIT-8-NEXT:    stp q2, q3, [x0]
+; MEMMOVE-LIMIT-8-NEXT:    ldp q1, q0, [x1, #96]
+; MEMMOVE-LIMIT-8-NEXT:    ldp q2, q3, [x1, #64]
+; MEMMOVE-LIMIT-8-NEXT:    stp q1, q0, [x0, #96]
+; MEMMOVE-LIMIT-8-NEXT:    stp q2, q3, [x0, #64]
+; MEMMOVE-LIMIT-8-NEXT:    ret
+  call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr %src, i64 128, i1 false)
+  ret void
+}
+
+; Test optsize memcpy with 128 bytes (8 stores of 16 bytes each)
+; This should be inlined with -max-store-memcpy=8
+; but should call memcpy library with -max-store-memcpy=7
+define void @memcpy_128_optsize(ptr %dst, ptr %src) optsize {
+; DEFAULT-LABEL: memcpy_128_optsize:
+; DEFAULT:       // %bb.0:
+; DEFAULT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; DEFAULT-NEXT:    .cfi_def_cfa_offset 16
+; DEFAULT-NEXT:    .cfi_offset w30, -16
+; DEFAULT-NEXT:    mov w2, #128 // =0x80
+; DEFAULT-NEXT:    bl memcpy
+; DEFAULT-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; DEFAULT-NEXT:    ret
+;
+; MEMSET-LIMIT-7-LABEL: memcpy_128_optsize:
+; MEMSET-LIMIT-7:       // %bb.0:
+; MEMSET-LIMIT-7-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; MEMSET-LIMIT-7-NEXT:    .cfi_def_cfa_offset 16
+; MEMSET-LIMIT-7-NEXT:    .cfi_offset w30, -16
+; MEMSET-LIMIT-7-NEXT:    mov w2, #128 // =0x80
+; MEMSET-LIMIT-7-NEXT:    bl memcpy
+; MEMSET-LIMIT-7-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; MEMSET-LIMIT-7-NEXT:    ret
+;
+; MEMSET-LIMIT-8-LABEL: memcpy_128_optsize:
+; MEMSET-LIMIT-8:       // %bb.0:
+; MEMSET-LIMIT-8-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; MEMSET-LIMIT-8-NEXT:    .cfi_def_cfa_offset 16
+; MEMSET-LIMIT-8-NEXT:    .cfi_offset w30, -16
+; MEMSET-LIMIT-8-NEXT:    mov w2, #128 // =0x80
+; MEMSET-LIMIT-8-NEXT:    bl memcpy
+; MEMSET-LIMIT-8-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; MEMSET-LIMIT-8-NEXT:    ret
+;
+; MEMCPY-LIMIT-7-LABEL: memcpy_128_optsize:
+; MEMCPY-LIMIT-7:       // %bb.0:
+; MEMCPY-LIMIT-7-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; MEMCPY-LIMIT-7-NEXT:    .cfi_def_cfa_offset 16
+; MEMCPY-LIMIT-7-NEXT:    .cfi_offset w30, -16
+; MEMCPY-LIMIT-7-NEXT:    mov w2, #128 // =0x80
+; MEMCPY-LIMIT-7-NEXT:    bl memcpy
+; MEMCPY-LIMIT-7-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; MEMCPY-LIMIT-7-NEXT:    ret
+;
+; MEMCPY-LIMIT-8-LABEL: memcpy_128_optsize:
+; MEMCPY-LIMIT-8:       // %bb.0:
+; MEMCPY-LIMIT-8-NEXT:    ldp q1, q0, [x1, #32]
+; MEMCPY-LIMIT-8-NEXT:    ldp q2, q3, [x1]
+; MEMCPY-LIMIT-8-NEXT:    stp q1, q0, [x0, #32]
+; MEMCPY-LIMIT-8-NEXT:    stp q2, q3, [x0]
+; MEMCPY-LIMIT-8-NEXT:    ldp q1, q0, [x1, #96]
+; MEMCPY-LIMIT-8-NEXT:    ldp q2, q3, [x1, #64]
+; MEMCPY-LIMIT-8-NEXT:    stp q1, q0, [x0, #96]
+; MEMCPY-LIMIT-8-NEXT:    stp q2, q3, [x0, #64]
+; MEMCPY-LIMIT-8-NEXT:    ret
+;
+; MEMMOVE-LIMIT-7-LABEL: memcpy_128_optsize:
+; MEMMOVE-LIMIT-7:       // %bb.0:
+; MEMMOVE-LIMIT-7-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; MEMMOVE-LIMIT-7-NEXT:    .cfi_def_cfa_offset 16
+; MEMMOVE-LIMIT-7-NEXT:    .cfi_offset w30, -16
+; MEMMOVE-LIMIT-7-NEXT:    mov w2, #128 // =0x80
+; MEMMOVE-LIMIT-7-NEXT:    bl memcpy
+; MEMMOVE-LIMIT-7-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; MEMMOVE-LIMIT-7-NEXT:    ret
+;
+; MEMMOVE-LIMIT-8-LABEL: memcpy_128_optsize:
+; MEMMOVE-LIMIT-8:       // %bb.0:
+; MEMMOVE-LIMIT-8-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; MEMMOVE-LIMIT-8-NEXT:    .cfi_def_cfa_offset 16
+; MEMMOVE-LIMIT-8-NEXT:    .cfi_offset w30, -16
+; MEMMOVE-LIMIT-8-NEXT:    mov w2, #128 // =0x80
+; MEMMOVE-LIMIT-8-NEXT:    bl memcpy
+; MEMMOVE-LIMIT-8-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; MEMMOVE-LIMIT-8-NEXT:    ret
+  call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr %src, i64 128, i1 false)
+  ret void
+}
+
+; memmove tests
+
+; Test memmove with 128 bytes (8 stores of 16 bytes each)
+; This should be inlined by default and with -max-store-memmove=8
+; but should call memmove library with -max-store-memmove=7
+define void @memmove_128(ptr %dst, ptr %src) {
+; DEFAULT-LABEL: memmove_128:
+; DEFAULT:       // %bb.0:
+; DEFAULT-NEXT:    ldp q0, q1, [x1, #96]
+; DEFAULT-NEXT:    ldp q2, q3, [x1]
+; DEFAULT-NEXT:    ldp q4, q5, [x1, #32]
+; DEFAULT-NEXT:    ldp q6, q7, [x1, #64]
+; DEFAULT-NEXT:    stp q0, q1, [x0, #96]
+; DEFAULT-NEXT:    stp q2, q3, [x0]
+; DEFAULT-NEXT:    stp q4, q5, [x0, #32]
+; DEFAULT-NEXT:    stp q6, q7, [x0, #64]
+; DEFAULT-NEXT:    ret
+;
+; MEMSET-LIMIT-7-LABEL: memmove_128:
+; MEMSET-LIMIT-7:       // %bb.0:
+; MEMSET-LIMIT-7-NEXT:    ldp q0, q1, [x1, #96]
+; MEMSET-LIMIT-7-NEXT:    ldp q2, q3, [x1]
+; MEMSET-LIMIT-7-NEXT:    ldp q4, q5, [x1, #32]
+; MEMSET-LIMIT-7-NEXT:    ldp q6, q7, [x1, #64]
+; MEMSET-LIMIT-7-NEXT:    stp q0, q1, [x0, #96]
+; MEMSET-LIMIT-7-NEXT:    stp q2, q3, [x0]
+; MEMSET-LIMIT-7-NEXT:    stp q4, q5, [x0, #32]
+; MEMSET-LIMIT-7-NEXT:    stp q6, q7, [x0, #64]
+; MEMSET-LIMIT-7-NEXT:    ret
+;
+; MEMSET-LIMIT-8-LABEL: memmove_128:
+; MEMSET-LIMIT-8:       // %bb.0:
+; MEMSET-LIMIT-8-NEXT:    ldp q0, q1, [x1, #96]
+; MEMSET-LIMIT-8-NEXT:    ldp q2, q3, [x1]
+; MEMSET-LIMIT-8-NEXT:    ldp q4, q5, [x1, #32]
+; MEMSET-LIMIT-8-NEXT:    ldp q6, q7, [x1, #64]
+; MEMSET-LIMIT-8-NEXT:    stp q0, q1, [x0, #96]
+; MEMSET-LIMIT-8-NEXT:    stp q2, q3, [x0]
+; MEMSET-LIMIT-8-NEXT:    stp q4, q5, [x0, #32]
+; MEMSET-LIMIT-8-NEXT:    stp q6, q7, [x0, #64]
+; MEMSET-LIMIT-8-NEXT:    ret
+;
+; MEMCPY-LIMIT-7-LABEL: memmove_128:
+; MEMCPY-LIMIT-7:       // %bb.0:
+; MEMCPY-LIMIT-7-NEXT:    ldp q0, q1, [x1, #96]
+; MEMCPY-LIMIT-7-NEXT:    ldp q2, q3, [x1]
+; MEMCPY-LIMIT-7-NEXT:    ldp q4, q5, [x1, #32]
+; MEMCPY-LIMIT-7-NEXT:    ldp q6, q7, [x1, #64]
+; MEMCPY-LIMIT-7-NEXT:    stp q0, q1, [x0, #96]
+; MEMCPY-LIMIT-7-NEXT:    stp q2, q3, [x0]
+; MEMCPY-LIMIT-7-NEXT:    stp q4, q5, [x0, #32]
+; MEMCPY-LIMIT-7-NEXT:    stp q6, q7, [x0, #64]
+; MEMCPY-LIMIT-7-NEXT:    ret
+;
+; MEMCPY-LIMIT-8-LABEL: memmove_128:
+; MEMCPY-LIMIT-8:       // %bb.0:
+; MEMCPY-LIMIT-8-NEXT:    ldp q0, q1, [x1, #96]
+; MEMCPY-LIMIT-8-NEXT:    ldp q2, q3, [x1]
+; MEMCPY-LIMIT-8-NEXT:    ldp q4, q5, [x1, #32]
+; MEMCPY-LIMIT-8-NEXT:    ldp q6, q7, [x1, #64]
+; MEMCPY-LIMIT-8-NEXT:    stp q0, q1, [x0, #96]
+; MEMCPY-LIMIT-8-NEXT:    stp q2, q3, [x0]
+; MEMCPY-LIMIT-8-NEXT:    stp q4, q5, [x0, #32]
+; MEMCPY-LIMIT-8-NEXT:    stp q6, q7, [x0, #64]
+; MEMCPY-LIMIT-8-NEXT:    ret
+;
+; MEMMOVE-LIMIT-7-LABEL: memmove_128:
+; MEMMOVE-LIMIT-7:       // %bb.0:
+; MEMMOVE-LIMIT-7-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; MEMMOVE-LIMIT-7-NEXT:    .cfi_def_cfa_offset 16
+; MEMMOVE-LIMIT-7-NEXT:    .cfi_offset w30, -16
+; MEMMOVE-LIMIT-7-NEXT:    mov w2, #128 // =0x80
+; MEMMOVE-LIMIT-7-NEXT:    bl memmove
+; MEMMOVE-LIMIT-7-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; MEMMOVE-LIMIT-7-NEXT:    ret
+;
+; MEMMOVE-LIMIT-8-LABEL: memmove_128:
+; MEMMOVE-LIMIT-8:       // %bb.0:
+; MEMMOVE-LIMIT-8-NEXT:    ldp q0, q1, [x1, #96]
+; MEMMOVE-LIMIT-8-NEXT:    ldp q2, q3, [x1]
+; MEMMOVE-LIMIT-8-NEXT:    ldp q4, q5, [x1, #32]
+; MEMMOVE-LIMIT-8-NEXT:    ldp q6, q7, [x1, #64]
+; MEMMOVE-LIMIT-8-NEXT:    stp q0, q1, [x0, #96]
+; MEMMOVE-LIMIT-8-NEXT:    stp q2, q3, [x0]
+; MEMMOVE-LIMIT-8-NEXT:    stp q4, q5, [x0, #32]
+; MEMMOVE-LIMIT-8-NEXT:    stp q6, q7, [x0, #64]
+; MEMMOVE-LIMIT-8-NEXT:    ret
+  call void @llvm.memmove.p0.p0.i64(ptr %dst, ptr %src, i64 128, i1 false)
+  ret void
+}
+
+; Test optsize memmove with 128 bytes (8 stores of 16 bytes each)
+; This should be inlined with -max-store-memmove=8
+; but should call memmove library with -max-store-memmove=7
+define void @memmove_128_optsize(ptr %dst, ptr %src) optsize {
+; DEFAULT-LABEL: memmove_128_optsize:
+; DEFAULT:       // %bb.0:
+; DEFAULT-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; DEFAULT-NEXT:    .cfi_def_cfa_offset 16
+; DEFAULT-NEXT:    .cfi_offset w30, -16
+; DEFAULT-NEXT:    mov w2, #128 // =0x80
+; DEFAULT-NEXT:    bl memmove
+; DEFAULT-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; DEFAULT-NEXT:    ret
+;
+; MEMSET-LIMIT-7-LABEL: memmove_128_optsize:
+; MEMSET-LIMIT-7:       // %bb.0:
+; MEMSET-LIMIT-7-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; MEMSET-LIMIT-7-NEXT:    .cfi_def_cfa_offset 16
+; MEMSET-LIMIT-7-NEXT:    .cfi_offset w30, -16
+; MEMSET-LIMIT-7-NEXT:    mov w2, #128 // =0x80
+; MEMSET-LIMIT-7-NEXT:    bl memmove
+; MEMSET-LIMIT-7-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; MEMSET-LIMIT-7-NEXT:    ret
+;
+; MEMSET-LIMIT-8-LABEL: memmove_128_optsize:
+; MEMSET-LIMIT-8:       // %bb.0:
+; MEMSET-LIMIT-8-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; MEMSET-LIMIT-8-NEXT:    .cfi_def_cfa_offset 16
+; MEMSET-LIMIT-8-NEXT:    .cfi_offset w30, -16
+; MEMSET-LIMIT-8-NEXT:    mov w2, #128 // =0x80
+; MEMSET-LIMIT-8-NEXT:    bl memmove
+; MEMSET-LIMIT-8-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; MEMSET-LIMIT-8-NEXT:    ret
+;
+; MEMCPY-LIMIT-7-LABEL: memmove_128_optsize:
+; MEMCPY-LIMIT-7:       // %bb.0:
+; MEMCPY-LIMIT-7-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; MEMCPY-LIMIT-7-NEXT:    .cfi_def_cfa_offset 16
+; MEMCPY-LIMIT-7-NEXT:    .cfi_offset w30, -16
+; MEMCPY-LIMIT-7-NEXT:    mov w2, #128 // =0x80
+; MEMCPY-LIMIT-7-NEXT:    bl memmove
+; MEMCPY-LIMIT-7-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; MEMCPY-LIMIT-7-NEXT:    ret
+;
+; MEMCPY-LIMIT-8-LABEL: memmove_128_optsize:
+; MEMCPY-LIMIT-8:       // %bb.0:
+; MEMCPY-LIMIT-8-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; MEMCPY-LIMIT-8-NEXT:    .cfi_def_cfa_offset 16
+; MEMCPY-LIMIT-8-NEXT:    .cfi_offset w30, -16
+; MEMCPY-LIMIT-8-NEXT:    mov w2, #128 // =0x80
+; MEMCPY-LIMIT-8-NEXT:    bl memmove
+; MEMCPY-LIMIT-8-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; MEMCPY-LIMIT-8-NEXT:    ret
+;
+; MEMMOVE-LIMIT-7-LABEL: memmove_128_optsize:
+; MEMMOVE-LIMIT-7:       // %bb.0:
+; MEMMOVE-LIMIT-7-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; MEMMOVE-LIMIT-7-NEXT:    .cfi_def_cfa_offset 16
+; MEMMOVE-LIMIT-7-NEXT:    .cfi_offset w30, -16
+; MEMMOVE-LIMIT-7-NEXT:    mov w2, #128 // =0x80
+; MEMMOVE-LIMIT-7-NEXT:    bl memmove
+; MEMMOVE-LIMIT-7-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; MEMMOVE-LIMIT-7-NEXT:    ret
+;
+; MEMMOVE-LIMIT-8-LABEL: memmove_128_optsize:
+; MEMMOVE-LIMIT-8:       // %bb.0:
+; MEMMOVE-LIMIT-8-NEXT:    ldp q0, q1, [x1, #96]
+; MEMMOVE-LIMIT-8-NEXT:    ldp q2, q3, [x1]
+; MEMMOVE-LIMIT-8-NEXT:    ldp q4, q5, [x1, #32]
+; MEMMOVE-LIMIT-8-NEXT:    ldp q6, q7, [x1, #64]
+; MEMMOVE-LIMIT-8-NEXT:    stp q0, q1, [x0, #96]
+; MEMMOVE-LIMIT-8-NEXT:    stp q2, q3, [x0]
+; MEMMOVE-LIMIT-8-NEXT:    stp q4, q5, [x0, #32]
+; MEMMOVE-LIMIT-8-NEXT:    stp q6, q7, [x0, #64]
+; MEMMOVE-LIMIT-8-NEXT:    ret
+  call void @llvm.memmove.p0.p0.i64(ptr %dst, ptr %src, i64 128, i1 false)
+  ret void
+}


        


More information about the llvm-commits mailing list