[compiler-rt] [llvm] [RFC] [msan] make MSan up to 20x faster on AMD CPUs (PR #171993)

Azat Khuzhin via llvm-commits llvm-commits at lists.llvm.org
Sun Dec 14 13:26:47 PST 2025


https://github.com/azat updated https://github.com/llvm/llvm-project/pull/171993

>From 4797f3367ab595f4493fc1b2f8a87002ad9a3f94 Mon Sep 17 00:00:00 2001
From: Azat Khuzhin <a3at.mail at gmail.com>
Date: Fri, 12 Dec 2025 09:32:15 +0100
Subject: [PATCH] [msan] ability to make MSan up to 20x faster on AMD CPUs
 (-DLLVM_MSAN_SHADOW_OFFSET_2MB=1)

I noticed that on AMD CPUs (so far I've tested Zen 3 and Zen 4c - AMD
EPYC 9R14) a simple program under MSan is up to 20x slower:

<details>

```c
#include <stdint.h>
#include <stdio.h>
#include <time.h>

// Benchmark workload: recursive factorial. Returns 1 for n <= 1; otherwise
// the product is computed in uint64_t (n is converted by the multiplication),
// which is exact up to n == 20 and wraps modulo 2^64 beyond that.
uint64_t factorial(int n) {
    if (n <= 1) return 1;
    return n * factorial(n - 1);
}

// Benchmark driver: calls factorial(20) 100,000,000 times and prints the
// processor time consumed by the loop, as measured with clock().
int main() {
    const int iterations = 100000000;
    clock_t start = clock();

    for (int i = 0; i < iterations; i++) {
        // `volatile` makes every store to `result` an observable access, so
        // the write is performed on each iteration.
        volatile uint64_t result = factorial(20);
    }

    // clock() ticks -> seconds.
    double elapsed = (double)(clock() - start) / CLOCKS_PER_SEC;
    printf("Direct loop:          %.3f seconds\n", elapsed);
    return 0;
}
```

</details>

The problem is cache conflicts: `result` and its address in the shadow
area conflict in the cache, which leads to tons of cache misses:

   Performance counter stats for './factorial-test-original':

         212,850,471      L1-dcache-loads
         200,634,333      L1-dcache-load-misses            #   94.26% of all L1-dcache accesses
     <not supported>      L1-dcache-stores

         1.232666099 seconds time elapsed

         1.228437000 seconds user
         0.000994000 seconds sys

To avoid these conflicts we can add a 2MiB offset to the shadow area, and
here are the results - a 20x improvement:

    $ /usr/bin/clang++ -fsanitize=memory -O3 factorial-test.c -o factorial-test-original
    $ ./factorial-test-original
    Direct loop:          1.223 seconds
    $ clang++ -fsanitize=memory -O3 factorial-test.c -o factorial-test-patched
    $ ./factorial-test-patched
    Direct loop:          0.060 seconds

I've tested performance on Intel CPUs (Intel(R) Xeon(R) Platinum 8124M
CPU @ 3.00GHz), and it is the same before and after the patch.

Note that to enable it you need to build LLVM with `-DLLVM_MSAN_SHADOW_OFFSET_2MB=1`.

v0: add 0x40 to XOR
v1: add 2MiB offset for shadow
v2: will be enabled only by compile time switch for MSan
---
 compiler-rt/CMakeLists.txt                    |  6 ++
 compiler-rt/lib/msan/msan.h                   | 71 +++++++++++++++++++
 llvm/CMakeLists.txt                           |  2 +
 .../Transforms/Instrumentation/CMakeLists.txt |  6 ++
 .../Instrumentation/MemorySanitizer.cpp       | 16 +++--
 5 files changed, 97 insertions(+), 4 deletions(-)

diff --git a/compiler-rt/CMakeLists.txt b/compiler-rt/CMakeLists.txt
index a92258ae12446..7cea1c36b629f 100644
--- a/compiler-rt/CMakeLists.txt
+++ b/compiler-rt/CMakeLists.txt
@@ -350,6 +350,12 @@ if(NOT COMPILER_RT_HAS_FUNC_SYMBOL)
   add_definitions(-D__func__=__FUNCTION__)
 endif()
 
+if (LLVM_MSAN_SHADOW_OFFSET_2MB)
+  add_definitions(-DLLVM_MSAN_SHADOW_OFFSET_2MB=1)
+else()
+  add_definitions(-DLLVM_MSAN_SHADOW_OFFSET_2MB=0)
+endif()
+
 # Provide some common commandline flags for Sanitizer runtimes.
 if("${ANDROID_API_LEVEL}" GREATER_EQUAL 29)
   list(APPEND SANITIZER_COMMON_CFLAGS -fno-emulated-tls)
diff --git a/compiler-rt/lib/msan/msan.h b/compiler-rt/lib/msan/msan.h
index edb26997be07d..0fec89c00fe56 100644
--- a/compiler-rt/lib/msan/msan.h
+++ b/compiler-rt/lib/msan/msan.h
@@ -199,6 +199,77 @@ const MappingDesc kMemoryLayout[] = {
 #define MEM_TO_SHADOW(mem) (LINEARIZE_MEM((mem)) + 0x100000000000ULL)
 #define SHADOW_TO_ORIGIN(shadow) (((uptr)(shadow)) + 0x280000000000)
 
+#elif LLVM_MSAN_SHADOW_OFFSET_2MB == 1
+
+#if SANITIZER_NETBSD || (SANITIZER_LINUX && SANITIZER_WORDSIZE == 64)
+
+// Offset applied to shadow addresses to avoid cache line conflicts on AMD Zen
+// (on Intel it is not required, but does not harm).
+//
+// Problem: AMD Zen's 32 KiB L1D cache is 8-way associative with 64-byte lines
+// (64 sets) [1]. Addresses are partitioned as:
+//
+//   | tag | set_index (bits 6-11) | offset (bits 0-5) |
+//           ^^^^^^^^^^^^^^^^^^^^^   ^^^^^^^^^^^^^^^^^
+//              6 bits set index    6 bits offset in line
+//
+// Without offset, app memory and its shadow share the same set_index (bits
+// 6-11) but have different tags. Normally, 8-way associativity accommodates
+// both lines in the same set without conflict.
+//
+// However, AMD Zen uses a Linear address μtag/way-predictor (derived from
+// addr[12:27], see [2]) to save power by predicting which way to check instead
+// of searching all 8 ways. Since XOR preserves addr[0:43], shadow and app share
+// identical μtags, causing them to predict the same way, effectively degrading
+// the 8-way cache to direct-mapped (20x slowdown).
+//
+// Solution: Adding 2MB offset (bit 21) changes the μtag while maintaining 2MB
+// page alignment for mmap.
+//
+//   [1]: https://lsferreira.net/public/knowledge-base/x86/upos/amd_zen+.PDF
+//   [2]: https://inria.hal.science/hal-02866777/document
+const unsigned long kShadowOffset = 0x200000ULL;
+// All of the following configurations are supported.
+// ASLR disabled: main executable and DSOs at 0x555550000000
+// PIE and ASLR: main executable and DSOs at 0x7f0000000000
+// non-PIE: main executable below 0x100000000, DSOs at 0x7f0000000000
+// Heap at 0x700000000000.
+const MappingDesc kMemoryLayout[] = {
+    {0x000000000000ULL, 0x010000000000ULL - kShadowOffset, MappingDesc::APP,
+     "app-1"},
+    {0x010000000000ULL - kShadowOffset, 0x010000000000ULL + kShadowOffset,
+     MappingDesc::INVALID, "gap"},
+    {0x010000000000ULL + kShadowOffset, 0x100000000000ULL + kShadowOffset,
+     MappingDesc::SHADOW, "shadow-2"},
+    {0x100000000000ULL + kShadowOffset, 0x110000000000ULL + kShadowOffset,
+     MappingDesc::INVALID, "invalid"},
+    {0x110000000000ULL + kShadowOffset, 0x200000000000ULL + kShadowOffset,
+     MappingDesc::ORIGIN, "origin-2"},
+    {0x200000000000ULL + kShadowOffset, 0x300000000000ULL + kShadowOffset,
+     MappingDesc::SHADOW, "shadow-3"},
+    {0x300000000000ULL + kShadowOffset, 0x400000000000ULL + kShadowOffset,
+     MappingDesc::ORIGIN, "origin-3"},
+    {0x400000000000ULL + kShadowOffset, 0x500000000000ULL + kShadowOffset,
+     MappingDesc::INVALID, "invalid"},
+    {0x500000000000ULL + kShadowOffset, 0x510000000000ULL, MappingDesc::SHADOW,
+     "shadow-1"},
+    {0x510000000000ULL, 0x600000000000ULL - kShadowOffset, MappingDesc::APP,
+     "app-2"},
+    {0x600000000000ULL - kShadowOffset, 0x600000000000ULL + kShadowOffset,
+     MappingDesc::INVALID, "gap"},
+    {0x600000000000ULL + kShadowOffset, 0x610000000000ULL, MappingDesc::ORIGIN,
+     "origin-1"},
+    {0x610000000000ULL, 0x700000000000ULL, MappingDesc::INVALID, "invalid"},
+    {0x700000000000ULL, 0x740000000000ULL, MappingDesc::ALLOCATOR, "allocator"},
+    {0x740000000000ULL, 0x800000000000ULL, MappingDesc::APP, "app-3"}};
+#  define MEM_TO_SHADOW(mem) \
+    ((((uptr)(mem)) ^ 0x500000000000ULL) + kShadowOffset)
+#  define SHADOW_TO_ORIGIN(mem) (((uptr)(mem)) + 0x100000000000ULL)
+
+#else
+#error LLVM_MSAN_SHADOW_OFFSET_2MB is applicable only for x86_64 NetBSD/Linux
+#endif
+
 #elif SANITIZER_NETBSD || (SANITIZER_LINUX && SANITIZER_WORDSIZE == 64)
 
 // All of the following configurations are supported.
diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index 6d94cbbcd2559..451de6797eeb9 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -144,6 +144,8 @@ foreach(proj ${LLVM_ENABLE_PROJECTS})
   endif()
 endforeach()
 
+option(LLVM_MSAN_SHADOW_OFFSET_2MB "Add 2MB offset to MSan shadow to avoid AMD Zen cache conflicts" OFF)
+
 # Select the runtimes to build
 #
 # As we migrate runtimes to using the bootstrapping build, the set of default runtimes
diff --git a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt
index 80576c61fd80c..bcd73c1d01dd5 100644
--- a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt
+++ b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt
@@ -44,3 +44,9 @@ add_llvm_component_library(LLVMInstrumentation
   TransformUtils
   ProfileData
   )
+
+if (LLVM_MSAN_SHADOW_OFFSET_2MB)
+  add_definitions(-DLLVM_MSAN_SHADOW_OFFSET_2MB=1)
+else()
+  add_definitions(-DLLVM_MSAN_SHADOW_OFFSET_2MB=0)
+endif()
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 32ee16c89b4fe..b4ddeb0a791c4 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -443,8 +443,14 @@ static const MemoryMapParams Linux_I386_MemoryMapParams = {
 static const MemoryMapParams Linux_X86_64_MemoryMapParams = {
     0,              // AndMask (not used)
     0x500000000000, // XorMask
-    0,              // ShadowBase (not used)
-    0x100000000000, // OriginBase
+    // Keep in sync with MEM_TO_SHADOW/SHADOW_TO_ORIGIN in msan.h.
+#if LLVM_MSAN_SHADOW_OFFSET_2MB == 1
+    0x200000,       // ShadowBase (== kShadowOffset)
+    0x100000200000, // OriginBase (== 0x100000000000 + kShadowOffset)
+#else
+    0x0,            // ShadowBase (not used)
+    0x100000000000, // OriginBase
+#endif
 };
 
 // mips32 Linux
@@ -531,8 +537,14 @@ static const MemoryMapParams FreeBSD_X86_64_MemoryMapParams = {
 static const MemoryMapParams NetBSD_X86_64_MemoryMapParams = {
     0,              // AndMask
     0x500000000000, // XorMask
-    0,              // ShadowBase
-    0x100000000000, // OriginBase
+    // Keep in sync with MEM_TO_SHADOW/SHADOW_TO_ORIGIN in msan.h.
+#if LLVM_MSAN_SHADOW_OFFSET_2MB == 1
+    0x200000,       // ShadowBase (== kShadowOffset)
+    0x100000200000, // OriginBase (== 0x100000000000 + kShadowOffset)
+#else
+    0x0,            // ShadowBase (not used)
+    0x100000000000, // OriginBase
+#endif
 };
 
 static const PlatformMemoryMapParams Linux_X86_MemoryMapParams = {



More information about the llvm-commits mailing list