[compiler-rt] baa7f58 - tsan: make obtaining current PC faster

Dmitry Vyukov via llvm-commits llvm-commits at lists.llvm.org
Mon Jul 19 04:05:36 PDT 2021


Author: Dmitry Vyukov
Date: 2021-07-19T13:05:30+02:00
New Revision: baa7f58973d47e99c663860e4c2c3d55505f2bc7

URL: https://github.com/llvm/llvm-project/commit/baa7f58973d47e99c663860e4c2c3d55505f2bc7
DIFF: https://github.com/llvm/llvm-project/commit/baa7f58973d47e99c663860e4c2c3d55505f2bc7.diff

LOG: tsan: make obtaining current PC faster

We obtain the current PC in all interceptors, and collectively
this common interceptor code contributes to the overall slowdown
(in particular for the cheaper str/mem* functions).

The current way to obtain the current PC involves:

  4493e1:       e8 3a f3 fe ff          callq  438720 <_ZN11__sanitizer10StackTrace12GetCurrentPcEv>
  4493e9:       48 89 c6                mov    %rax,%rsi

and the called function is:

uptr StackTrace::GetCurrentPc() {
  438720:       48 8b 04 24             mov    (%rsp),%rax
  438724:       c3                      retq

The new way uses address of a local label and involves just:

  44a888:       48 8d 35 fa ff ff ff    lea    -0x6(%rip),%rsi
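
For illustration, the following is a minimal standalone sketch of the same
statement-expression/inline-asm pattern the patch introduces. The uptr
typedef, the printing, and the main() driver are assumptions for this
example only, not sanitizer code; the macro body mirrors the one added in
sanitizer_stacktrace.h below.

  // Minimal sketch (x86_64, GCC/Clang): read RIP via a rip-relative lea
  // inside a GNU statement expression, so no call/ret pair is executed.
  #include <cstdint>
  #include <cstdio>

  typedef uintptr_t uptr;  // stand-in for the sanitizer's uptr

  #define GET_CURRENT_PC()                \
    ({                                    \
      uptr pc;                            \
      asm("lea 0(%%rip), %0" : "=r"(pc)); \
      pc;                                 \
    })

  int main() {
    uptr pc = GET_CURRENT_PC();  // points just past the lea instruction
    printf("current pc: %p\n", (void *)pc);
  }

Compared with the called StackTrace::GetCurrentPc(), this compiles down to a
single lea, matching the disassembly above.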

I am not switching all uses of StackTrace::GetCurrentPc to GET_CURRENT_PC
because it may lead to differences in produced reports and break tests.
The difference comes from the fact that currently the PC points
to the CALL instruction, whereas the new way does not emit any code of its own,
so the PC points to an arbitrary instruction in the function, and symbolizing
that instruction can produce additional inlined frames (if that arbitrary
instruction happens to belong to some inlined function).

Reviewed By: vitalybuka, melver

Differential Revision: https://reviews.llvm.org/D106046

Added: 
    

Modified: 
    compiler-rt/lib/sanitizer_common/sanitizer_stacktrace.h
    compiler-rt/lib/sanitizer_common/tests/sanitizer_stacktrace_test.cpp
    compiler-rt/lib/tsan/rtl/tsan_interceptors.h

Removed: 
    


################################################################################
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace.h b/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace.h
index 6b2a687a990ca..ea330f36f7d79 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace.h
@@ -196,5 +196,26 @@ static inline bool IsValidFrame(uptr frame, uptr stack_top, uptr stack_bottom) {
   uptr local_stack;                           \
   uptr sp = (uptr)&local_stack
 
+// GET_CURRENT_PC() is equivalent to StackTrace::GetCurrentPc().
+// Optimized x86 version is faster than GetCurrentPc because
+// it does not involve a function call, instead it reads RIP register.
+// Reads of RIP by an instruction return RIP pointing to the next
+// instruction, which is exactly what we want here, thus 0 offset.
+// It needs to be a macro because otherwise we will get the name
+// of this function on the top of most stacks. Attribute artificial
+// does not do what it claims to do, unfortunately. And attribute
+// __nodebug__ is clang-only. If we had an attribute that
+// would remove this function from debug info, we could simply make
+// StackTrace::GetCurrentPc() faster.
+#if defined(__x86_64__)
+#  define GET_CURRENT_PC()                \
+    ({                                    \
+      uptr pc;                            \
+      asm("lea 0(%%rip), %0" : "=r"(pc)); \
+      pc;                                 \
+    })
+#else
+#  define GET_CURRENT_PC() StackTrace::GetCurrentPc()
+#endif
 
 #endif  // SANITIZER_STACKTRACE_H

diff --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_stacktrace_test.cpp b/compiler-rt/lib/sanitizer_common/tests/sanitizer_stacktrace_test.cpp
index 30cc197bb2429..72023ea432da2 100644
--- a/compiler-rt/lib/sanitizer_common/tests/sanitizer_stacktrace_test.cpp
+++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_stacktrace_test.cpp
@@ -271,6 +271,30 @@ TEST(SlowUnwindTest, ShortStackTrace) {
   EXPECT_EQ(bp, stack.top_frame_bp);
 }
 
+TEST(GetCurrentPc, Basic) {
+  // Test that PCs obtained via GET_CURRENT_PC()
+  // and StackTrace::GetCurrentPc() are all different
+  // and are close to the function start.
+  struct Local {
+    static NOINLINE void Test() {
+      const uptr pcs[] = {
+          (uptr)&Local::Test,
+          GET_CURRENT_PC(),
+          StackTrace::GetCurrentPc(),
+          StackTrace::GetCurrentPc(),
+      };
+      for (uptr i = 0; i < ARRAY_SIZE(pcs); i++)
+        Printf("pc%zu: %p\n", i, pcs[i]);
+      for (uptr i = 1; i < ARRAY_SIZE(pcs); i++) {
+        EXPECT_GT(pcs[i], pcs[0]);
+        EXPECT_LT(pcs[i], pcs[0] + 1000);
+        for (uptr j = 0; j < i; j++) EXPECT_NE(pcs[i], pcs[j]);
+      }
+    }
+  };
+  Local::Test();
+}
+
 // Dummy implementation. This should never be called, but is required to link
 // non-optimized builds of this test.
 void BufferedStackTrace::UnwindImpl(uptr pc, uptr bp, void *context,

diff --git a/compiler-rt/lib/tsan/rtl/tsan_interceptors.h b/compiler-rt/lib/tsan/rtl/tsan_interceptors.h
index 29576ea2d49ad..c5716f53a323a 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_interceptors.h
+++ b/compiler-rt/lib/tsan/rtl/tsan_interceptors.h
@@ -30,14 +30,14 @@ inline bool in_symbolizer() {
 
 }  // namespace __tsan
 
-#define SCOPED_INTERCEPTOR_RAW(func, ...) \
-    cur_thread_init(); \
-    ThreadState *thr = cur_thread(); \
-    const uptr caller_pc = GET_CALLER_PC(); \
-    ScopedInterceptor si(thr, #func, caller_pc); \
-    const uptr pc = StackTrace::GetCurrentPc(); \
-    (void)pc; \
-/**/
+#define SCOPED_INTERCEPTOR_RAW(func, ...)      \
+  cur_thread_init();                           \
+  ThreadState *thr = cur_thread();             \
+  const uptr caller_pc = GET_CALLER_PC();      \
+  ScopedInterceptor si(thr, #func, caller_pc); \
+  const uptr pc = GET_CURRENT_PC();            \
+  (void)pc;                                    \
+  /**/
 
 #define SCOPED_TSAN_INTERCEPTOR(func, ...) \
     SCOPED_INTERCEPTOR_RAW(func, __VA_ARGS__); \




More information about the llvm-commits mailing list