[libc-commits] [libc] b5f04d8 - [libc] Use different alignment for memcpy between ARM and x86.

Guillaume Chatelet via libc-commits libc-commits at lists.llvm.org
Mon Apr 26 12:30:29 PDT 2021


Author: Guillaume Chatelet
Date: 2021-04-26T19:30:00Z
New Revision: b5f04d81a2c0b01db32ed7c1685be37e6bdce171

URL: https://github.com/llvm/llvm-project/commit/b5f04d81a2c0b01db32ed7c1685be37e6bdce171
DIFF: https://github.com/llvm/llvm-project/commit/b5f04d81a2c0b01db32ed7c1685be37e6bdce171.diff

LOG: [libc] Use different alignment for memcpy between ARM and x86.

The aligned copy used to be 'destination aligned' on x86, but this decision was reverted in D93457, where we noticed that 'source aligned' was better for ARM.
More benchmarking confirmed that aligning the copy to the destination can be up to 30% faster on x86. This patch provides both implementations and switches x86 back to destination aligned.
It also fixes the alignment to 32 bytes on x86.

Differential Revision: https://reviews.llvm.org/D101296
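
For context, an illustrative sketch (not code from this patch) of what the two strategies mean: the only difference is which side of the copy the aligned accesses land on. The helper names below are made up for illustration; CopyBlock is the existing helper from memcpy_utils.h, and the head/tail handling and the pointer arithmetic that actually establishes alignment (done by the real CopySrcAlignedBlocks/CopyDstAlignedBlocks) are omitted:

    // Hypothetical, simplified loops; the names are illustrative only.
    template <size_t kBlock>
    static void copy_loop_src_aligned(char *__restrict dst,
                                      const char *__restrict src, size_t count) {
      // Loads come from kBlock-aligned source addresses; stores may be unaligned.
      src = static_cast<const char *>(__builtin_assume_aligned(src, kBlock));
      for (size_t offset = 0; offset + kBlock <= count; offset += kBlock)
        CopyBlock<kBlock>(dst + offset, src + offset);
    }

    template <size_t kBlock>
    static void copy_loop_dst_aligned(char *__restrict dst,
                                      const char *__restrict src, size_t count) {
      // Stores go to kBlock-aligned destination addresses; loads may be unaligned.
      dst = static_cast<char *>(__builtin_assume_aligned(dst, kBlock));
      for (size_t offset = 0; offset + kBlock <= count; offset += kBlock)
        CopyBlock<kBlock>(dst + offset, src + offset);
    }

aarch64 keeps the source-aligned variant; x86 switches to the destination-aligned one, which is where the measured up-to-30% difference was observed.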

Added: 
    

Modified: 
    libc/src/string/aarch64/memcpy.cpp
    libc/src/string/memcpy.cpp
    libc/src/string/memory_utils/memcpy_utils.h
    libc/src/string/x86/memcpy.cpp
    libc/test/src/string/memory_utils/memcpy_utils_test.cpp

Removed: 
    


################################################################################
diff --git a/libc/src/string/aarch64/memcpy.cpp b/libc/src/string/aarch64/memcpy.cpp
index 63ed5fd48c710..78988ec37afe6 100644
--- a/libc/src/string/aarch64/memcpy.cpp
+++ b/libc/src/string/aarch64/memcpy.cpp
@@ -54,7 +54,7 @@ static void memcpy_aarch64(char *__restrict dst, const char *__restrict src,
     return CopyBlockOverlap<32>(dst, src, count);
   if (count < 128)
     return CopyBlockOverlap<64>(dst, src, count);
-  return CopyAlignedBlocks<64, 16>(dst, src, count);
+  return CopySrcAlignedBlocks<64, 16>(dst, src, count);
 }
 
 LLVM_LIBC_FUNCTION(void *, memcpy,

diff --git a/libc/src/string/memcpy.cpp b/libc/src/string/memcpy.cpp
index a145b904b87d2..e050d7f4ffa2d 100644
--- a/libc/src/string/memcpy.cpp
+++ b/libc/src/string/memcpy.cpp
@@ -52,7 +52,7 @@ static void memcpy_impl(char *__restrict dst, const char *__restrict src,
     return CopyBlockOverlap<32>(dst, src, count);
   if (count < 128)
     return CopyBlockOverlap<64>(dst, src, count);
-  return CopyAlignedBlocks<32>(dst, src, count);
+  return CopySrcAlignedBlocks<32>(dst, src, count);
 }
 
 LLVM_LIBC_FUNCTION(void *, memcpy,

diff --git a/libc/src/string/memory_utils/memcpy_utils.h b/libc/src/string/memory_utils/memcpy_utils.h
index 8fb04915c37b5..23836bbde3427 100644
--- a/libc/src/string/memory_utils/memcpy_utils.h
+++ b/libc/src/string/memory_utils/memcpy_utils.h
@@ -98,8 +98,8 @@ static void CopyBlockOverlap(char *__restrict dst, const char *__restrict src,
 //               `count > 2 * kBlockSize` for efficiency.
 //               `count >= kAlignment` for correctness.
 template <size_t kBlockSize, size_t kAlignment = kBlockSize>
-static void CopyAlignedBlocks(char *__restrict dst, const char *__restrict src,
-                              size_t count) {
+static void CopySrcAlignedBlocks(char *__restrict dst,
+                                 const char *__restrict src, size_t count) {
   static_assert(is_power2(kAlignment), "kAlignment must be a power of two");
   static_assert(is_power2(kBlockSize), "kBlockSize must be a power of two");
   static_assert(kAlignment <= kBlockSize,
@@ -116,6 +116,25 @@ static void CopyAlignedBlocks(char *__restrict dst, const char *__restrict src,
   CopyLastBlock<kBlockSize>(dst, src, count); // Copy last block
 }
 
+template <size_t kBlockSize, size_t kAlignment = kBlockSize>
+static void CopyDstAlignedBlocks(char *__restrict dst,
+                                 const char *__restrict src, size_t count) {
+  static_assert(is_power2(kAlignment), "kAlignment must be a power of two");
+  static_assert(is_power2(kBlockSize), "kBlockSize must be a power of two");
+  static_assert(kAlignment <= kBlockSize,
+                "kAlignment must be less or equal to block size");
+  CopyBlock<kAlignment>(dst, src); // Copy first block
+
+  // Copy aligned blocks
+  const size_t ofla = offset_from_last_aligned<kAlignment>(dst);
+  const size_t limit = count + ofla - kBlockSize;
+  for (size_t offset = kAlignment; offset < limit; offset += kBlockSize)
+    CopyBlock<kBlockSize>(assume_aligned<kAlignment>(dst - ofla + offset),
+                          src - ofla + offset);
+
+  CopyLastBlock<kBlockSize>(dst, src, count); // Copy last block
+}
+
 } // namespace __llvm_libc
 
 #endif //  LIBC_SRC_STRING_MEMORY_UTILS_MEMCPY_UTILS_H
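
To make the new destination-aligned loop arithmetic concrete, here is a hand trace of CopyDstAlignedBlocks<4, 4> for the 'Misaligned destination' case exercised in the tests below (dst one byte past an aligned address, count = 13):

    // dst = base + 1, src = base, count = 13, kBlockSize = kAlignment = 4.
    CopyBlock<4>(dst, src);           // head: writes buffer offsets 1..4
    // ofla  = offset_from_last_aligned<4>(dst) = 1
    // limit = count + ofla - kBlockSize = 13 + 1 - 4 = 10
    // offset = 4: CopyBlock<4>(dst - 1 + 4, src - 1 + 4) -> offsets 4..7  (dst aligned)
    // offset = 8: CopyBlock<4>(dst - 1 + 8, src - 1 + 8) -> offsets 8..11 (dst aligned)
    // offset = 12 is not < limit = 10, so the loop stops.
    CopyLastBlock<4>(dst, src, 13);   // tail: writes buffer offsets 10..13
    // Write trace "01112111112211": offsets 4, 10 and 11 are written twice,
    // matching the 'Misaligned destination' expectation added to the tests below.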

diff --git a/libc/src/string/x86/memcpy.cpp b/libc/src/string/x86/memcpy.cpp
index b9163d978befd..bbd8fe9ebf7a5 100644
--- a/libc/src/string/x86/memcpy.cpp
+++ b/libc/src/string/x86/memcpy.cpp
@@ -87,7 +87,7 @@ static void memcpy_x86(char *__restrict dst, const char *__restrict src,
   if (kHasAvx && count < 256)
     return CopyBlockOverlap<128>(dst, src, count);
   if (count <= kRepMovsBSize)
-    return CopyAlignedBlocks<kLoopCopyBlockSize>(dst, src, count);
+    return CopyDstAlignedBlocks<kLoopCopyBlockSize, 32>(dst, src, count);
   return CopyRepMovsb(dst, src, count);
 }
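
As a usage note (the wrapper name below is made up, and the block size of 64 is an assumption for an AVX build; the actual value is this file's kLoopCopyBlockSize), the loop path now keeps its stores 32-byte aligned, which matches the 32-byte YMM registers used for AVX copies:

    // Hypothetical expansion of the x86 loop path on an AVX build.
    static void copy_loop_x86_avx(char *__restrict dst,
                                  const char *__restrict src, size_t count) {
      CopyDstAlignedBlocks<64, 32>(dst, src, count);
    }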
 

diff --git a/libc/test/src/string/memory_utils/memcpy_utils_test.cpp b/libc/test/src/string/memory_utils/memcpy_utils_test.cpp
index be749c1b150b8..37529ae3d8ada 100644
--- a/libc/test/src/string/memory_utils/memcpy_utils_test.cpp
+++ b/libc/test/src/string/memory_utils/memcpy_utils_test.cpp
@@ -160,12 +160,12 @@ TEST(LlvmLibcMemcpyUtilsTest, CopyBlockOverlap) {
   EXPECT_STREQ(trace.Read(), "01112111");
 }
 
-TEST(LlvmLibcMemcpyUtilsTest, CopyAlignedBlocks) {
+TEST(LlvmLibcMemcpyUtilsTest, CopySrcAlignedBlocks) {
   auto &trace = GetTrace();
   // Source is aligned and multiple of alignment.
   //   "1111"
   trace.Clear();
-  CopyAlignedBlocks<4>(I(0), I(0), 4);
+  CopySrcAlignedBlocks<4>(I(0), I(0), 4);
   EXPECT_STREQ(trace.Write(), "2222");
   EXPECT_STREQ(trace.Read(), "2222");
 
@@ -174,7 +174,7 @@ TEST(LlvmLibcMemcpyUtilsTest, CopyAlignedBlocks) {
   // + "00001111"
   // = "11111111"
   trace.Clear();
-  CopyAlignedBlocks<4>(I(0), I(0), 8);
+  CopySrcAlignedBlocks<4>(I(0), I(0), 8);
   EXPECT_STREQ(trace.Write(), "11111111");
   EXPECT_STREQ(trace.Read(), "11111111");
 
@@ -185,7 +185,7 @@ TEST(LlvmLibcMemcpyUtilsTest, CopyAlignedBlocks) {
   // + "0000000001111"
   // = "1111111112221"
   trace.Clear();
-  CopyAlignedBlocks<4>(I(0), I(0), 13);
+  CopySrcAlignedBlocks<4>(I(0), I(0), 13);
   EXPECT_STREQ(trace.Write(), "1111111112221");
   EXPECT_STREQ(trace.Read(), "1111111112221");
 
@@ -196,7 +196,7 @@ TEST(LlvmLibcMemcpyUtilsTest, CopyAlignedBlocks) {
   // + "00000000001111"
   // = "01112111112211"
   trace.Clear();
-  CopyAlignedBlocks<4>(I(0), I(1), 13);
+  CopySrcAlignedBlocks<4>(I(0), I(1), 13);
   EXPECT_STREQ(trace.Write(), "1112111112211");
   EXPECT_STREQ(trace.Read(), "01112111112211");
 
@@ -206,24 +206,89 @@ TEST(LlvmLibcMemcpyUtilsTest, CopyAlignedBlocks) {
   // + "000000001111"
   // = "011121111111"
   trace.Clear();
-  CopyAlignedBlocks<4>(I(0), I(1), 11);
+  CopySrcAlignedBlocks<4>(I(0), I(1), 11);
   EXPECT_STREQ(trace.Write(), "11121111111");
   EXPECT_STREQ(trace.Read(), "011121111111");
 }
 
+TEST(LlvmLibcMemcpyUtilsTest, CopyDstAlignedBlocks) {
+  auto &trace = GetTrace();
+  // Destination is aligned and multiple of alignment.
+  //   "1111"
+  trace.Clear();
+  CopyDstAlignedBlocks<4>(I(0), I(0), 4);
+  EXPECT_STREQ(trace.Write(), "2222");
+  EXPECT_STREQ(trace.Read(), "2222");
+
+  // Destination is aligned and multiple of alignment.
+  //   "11110000"
+  // + "00001111"
+  // = "11111111"
+  trace.Clear();
+  CopyDstAlignedBlocks<4>(I(0), I(0), 8);
+  EXPECT_STREQ(trace.Write(), "11111111");
+  EXPECT_STREQ(trace.Read(), "11111111");
+
+  // Destination is aligned already overlap at end.
+  //   "1111000000000"
+  // + "0000111100000"
+  // + "0000000011110"
+  // + "0000000001111"
+  // = "1111111112221"
+  trace.Clear();
+  CopyDstAlignedBlocks<4>(I(0), I(0), 13);
+  EXPECT_STREQ(trace.Write(), "1111111112221");
+  EXPECT_STREQ(trace.Read(), "1111111112221");
+
+  // Misaligned destination.
+  //   "01111000000000"
+  // + "00001111000000"
+  // + "00000000111100"
+  // + "00000000001111"
+  // = "01112111112211"
+  trace.Clear();
+  CopyDstAlignedBlocks<4>(I(1), I(0), 13);
+  EXPECT_STREQ(trace.Write(), "01112111112211");
+  EXPECT_STREQ(trace.Read(), "1112111112211");
+
+  // Misaligned destination aligned at end.
+  //   "011110000000"
+  // + "000011110000"
+  // + "000000001111"
+  // = "011121111111"
+  trace.Clear();
+  CopyDstAlignedBlocks<4>(I(1), I(0), 11);
+  EXPECT_STREQ(trace.Write(), "011121111111");
+  EXPECT_STREQ(trace.Read(), "11121111111");
+}
+
 TEST(LlvmLibcMemcpyUtilsTest, CopyAlignedBlocksWithAlignment) {
   auto &trace = GetTrace();
   // Source is aligned and multiple of alignment.
   //   "11111111"
   trace.Clear();
-  CopyAlignedBlocks<8, 4>(I(0), I(0), 8);
+  CopySrcAlignedBlocks<8, 4>(I(0), I(0), 8);
+  EXPECT_STREQ(trace.Write(), "22221111");
+  EXPECT_STREQ(trace.Read(), "22221111");
+
+  // Destination is aligned and multiple of alignment.
+  //   "11111111"
+  trace.Clear();
+  CopyDstAlignedBlocks<8, 4>(I(0), I(0), 8);
   EXPECT_STREQ(trace.Write(), "22221111");
   EXPECT_STREQ(trace.Read(), "22221111");
 
   // Source is aligned and multiple of alignment.
   //   "111111111"
   trace.Clear();
-  CopyAlignedBlocks<8, 4>(I(0), I(0), 9);
+  CopySrcAlignedBlocks<8, 4>(I(0), I(0), 9);
+  EXPECT_STREQ(trace.Write(), "122211111");
+  EXPECT_STREQ(trace.Read(), "122211111");
+
+  // Destination is aligned and multiple of alignment.
+  //   "111111111"
+  trace.Clear();
+  CopyDstAlignedBlocks<8, 4>(I(0), I(0), 9);
   EXPECT_STREQ(trace.Write(), "122211111");
   EXPECT_STREQ(trace.Read(), "122211111");
 }
@@ -234,7 +299,7 @@ TEST(LlvmLibcMemcpyUtilsTest, CopyAlignedBlocksMaxReloads) {
     for (size_t count = 64; count < 768; ++count) {
       trace.Clear();
       // We should never reload more than twice when copying from count = 2x32.
-      CopyAlignedBlocks<32>(I(alignment), I(0), count);
+      CopySrcAlignedBlocks<32>(I(alignment), I(0), count);
       const char *const written = trace.Write();
       // First bytes are untouched.
       for (size_t i = 0; i < alignment; ++i)
@@ -254,7 +319,7 @@ TEST(LlvmLibcMemcpyUtilsTest, CopyAlignedBlocksWithAlignmentMaxReloads) {
     for (size_t count = 64; count < 768; ++count) {
       trace.Clear();
       // We should never reload more than twice when copying from count = 2x32.
-      CopyAlignedBlocks<32, 16>(I(alignment), I(0), count);
+      CopySrcAlignedBlocks<32, 16>(I(alignment), I(0), count);
       const char *const written = trace.Write();
       // First bytes are untouched.
       for (size_t i = 0; i < alignment; ++i)


        

