[compiler-rt] 7fdf270 - [dfsan] Track origin at loads

Jianzhou Zhao via llvm-commits llvm-commits at lists.llvm.org
Thu Apr 22 09:26:01 PDT 2021


Author: Jianzhou Zhao
Date: 2021-04-22T16:25:24Z
New Revision: 7fdf270965584a3b63ffed85d3c1ef20b3510668

URL: https://github.com/llvm/llvm-project/commit/7fdf270965584a3b63ffed85d3c1ef20b3510668
DIFF: https://github.com/llvm/llvm-project/commit/7fdf270965584a3b63ffed85d3c1ef20b3510668.diff

LOG: [dfsan] Track origin at loads

    The first version of origin tracking tracks only memory stores. Although
    this is sufficient for understanding correct flows, it is hard to figure
    out where an undefined value is read from. To find where undefined values
    are read, we still have to do a reverse binary search from the last store
    in the chain, with printing and logging along possible code paths. This is
    quite inefficient.

    Tracking memory load instructions can help this case. The main issues of
    tracking loads are performance and code size overheads.

    When tracking only stores, the code size overhead is 38%,
    memory overhead is 1x, and CPU overhead is 3x. In practice the number of
    loads is much larger than the number of stores, so both code size and CPU
    overhead increase when loads are tracked as well. The first blocker is
    code size overhead: linking fails if we inline the load tracking. The
    workaround is to use external function calls to propagate metadata. This
    is also the workaround ASan uses. With load tracking, the CPU overhead
    is ~10x. This is a trade-off between debuggability and performance,
    and load tracking will be used only when debugging cases where tracking
    only stores is not enough.

Reviewed By: gbalats

Differential Revision: https://reviews.llvm.org/D100967

Added: 
    compiler-rt/test/dfsan/origin_track_ld.c
    llvm/test/Instrumentation/DataFlowSanitizer/origin_track_load.ll

Modified: 
    compiler-rt/lib/dfsan/dfsan.cpp
    llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
    llvm/test/Instrumentation/DataFlowSanitizer/basic.ll

Removed: 
    


################################################################################
diff  --git a/compiler-rt/lib/dfsan/dfsan.cpp b/compiler-rt/lib/dfsan/dfsan.cpp
index 2aff8869d2cf9..e60703cc40671 100644
--- a/compiler-rt/lib/dfsan/dfsan.cpp
+++ b/compiler-rt/lib/dfsan/dfsan.cpp
@@ -559,14 +559,26 @@ static void WriteShadowIfDifferent(dfsan_label label, uptr shadow_addr,
   }
 }
 
+#define RET_CHAIN_ORIGIN(id)           \
+  GET_CALLER_PC_BP_SP;                 \
+  (void)sp;                            \
+  GET_STORE_STACK_TRACE_PC_BP(pc, bp); \
+  return ChainOrigin(id, &stack);
+
 // Return a new origin chain with the previous ID id and the current stack
 // trace.
 extern "C" SANITIZER_INTERFACE_ATTRIBUTE dfsan_origin
 __dfsan_chain_origin(dfsan_origin id) {
-  GET_CALLER_PC_BP_SP;
-  (void)sp;
-  GET_STORE_STACK_TRACE_PC_BP(pc, bp);
-  return ChainOrigin(id, &stack);
+  RET_CHAIN_ORIGIN(id)
+}
+
+// Return a new origin chain with the previous ID id and the current stack
+// trace if the label is tainted.
+extern "C" SANITIZER_INTERFACE_ATTRIBUTE dfsan_origin
+__dfsan_chain_origin_if_tainted(dfsan_label label, dfsan_origin id) {
+  if (!label)
+    return id;
+  RET_CHAIN_ORIGIN(id)
 }
 
 // Copy or move the origins of the len bytes from src to dst.

diff  --git a/compiler-rt/test/dfsan/origin_track_ld.c b/compiler-rt/test/dfsan/origin_track_ld.c
new file mode 100644
index 0000000000000..96edbea5381ef
--- /dev/null
+++ b/compiler-rt/test/dfsan/origin_track_ld.c
@@ -0,0 +1,31 @@
+// RUN: %clang_dfsan -gmlt -mllvm -dfsan-track-origins=2 -mllvm -dfsan-fast-16-labels=true %s -o %t && \
+// RUN:     %run %t > %t.out 2>&1
+// RUN: FileCheck %s < %t.out
+//
+// REQUIRES: x86_64-target-arch
+
+#include <sanitizer/dfsan_interface.h>
+
+__attribute__((noinline)) uint64_t foo(uint64_t a, uint64_t b) { return a + b; }
+
+int main(int argc, char *argv[]) {
+  uint64_t a = 10;
+  uint64_t b = 20;
+  dfsan_set_label(8, &a, sizeof(a));
+  uint64_t c = foo(a, b);
+  dfsan_print_origin_trace(&c, NULL);
+}
+
+// CHECK: Taint value 0x8 {{.*}} origin tracking ()
+// CHECK: Origin value: {{.*}}, Taint value was stored to memory at
+// CHECK: #0 {{.*}} in main {{.*}}origin_track_ld.c:[[@LINE-6]]
+
+// CHECK: Origin value: {{.*}}, Taint value was stored to memory at
+// CHECK: #0 {{.*}} in dfs$foo {{.*}}origin_track_ld.c:[[@LINE-15]]
+// CHECK: #1 {{.*}} in main {{.*}}origin_track_ld.c:[[@LINE-10]]
+
+// CHECK: Origin value: {{.*}}, Taint value was stored to memory at
+// CHECK: #0 {{.*}} in main {{.*}}origin_track_ld.c:[[@LINE-13]]
+
+// CHECK: Origin value: {{.*}}, Taint value was created at
+// CHECK: #0 {{.*}} in main {{.*}}origin_track_ld.c:[[@LINE-17]]

diff  --git a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
index 35315fe79fc1f..19dd419047797 100644
--- a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -256,7 +256,8 @@ static cl::opt<int> ClInstrumentWithCallThreshold(
 // Controls how to track origins.
 // * 0: do not track origins.
 // * 1: track origins at memory store operations.
-// * 2: TODO: track origins at memory store operations and callsites.
+// * 2: track origins at memory load and store operations.
+//      TODO: track callsites.
 static cl::opt<int> ClTrackOrigins("dfsan-track-origins",
                                    cl::desc("Track origins of labels"),
                                    cl::Hidden, cl::init(0));
@@ -453,6 +454,7 @@ class DataFlowSanitizer {
   FunctionType *DFSanLoadStoreCallbackFnTy;
   FunctionType *DFSanMemTransferCallbackFnTy;
   FunctionType *DFSanChainOriginFnTy;
+  FunctionType *DFSanChainOriginIfTaintedFnTy;
   FunctionType *DFSanMemOriginTransferFnTy;
   FunctionType *DFSanMaybeStoreOriginFnTy;
   FunctionCallee DFSanUnionFn;
@@ -469,6 +471,7 @@ class DataFlowSanitizer {
   FunctionCallee DFSanMemTransferCallbackFn;
   FunctionCallee DFSanCmpCallbackFn;
   FunctionCallee DFSanChainOriginFn;
+  FunctionCallee DFSanChainOriginIfTaintedFn;
   FunctionCallee DFSanMemOriginTransferFn;
   FunctionCallee DFSanMaybeStoreOriginFn;
   SmallPtrSet<Value *, 16> DFSanRuntimeFunctions;
@@ -637,9 +640,18 @@ struct DFSanFunction {
   Value *combineShadowsThenConvert(Type *T, Value *V1, Value *V2,
                                    Instruction *Pos);
   Value *combineOperandShadows(Instruction *Inst);
-  std::pair<Value *, Value *> loadShadowOrigin(Value *ShadowAddr, uint64_t Size,
+
+  /// Generates IR to load shadow and origin corresponding to bytes [\p
+  /// Addr, \p Addr + \p Size), where addr has alignment \p
+  /// InstAlignment, and take the union of each of those shadows. The returned
+  /// shadow always has primitive type.
+  ///
+  /// When tracking loads is enabled, the returned origin is a chain at the
+  /// current stack if the returned shadow is tainted.
+  std::pair<Value *, Value *> loadShadowOrigin(Value *Addr, uint64_t Size,
                                                Align InstAlignment,
                                                Instruction *Pos);
+
   void storePrimitiveShadowOrigin(Value *Addr, uint64_t Size,
                                   Align InstAlignment, Value *PrimitiveShadow,
                                   Value *Origin, Instruction *Pos);
@@ -695,11 +707,18 @@ struct DFSanFunction {
   /// additional call with many instructions. To ensure common cases are fast,
   /// checks if it is possible to load labels and origins without using the
   /// callback function.
+  ///
+  /// When enabling tracking load instructions, we always use
+  /// __dfsan_load_label_and_origin to reduce code size.
   bool useCallbackLoadLabelAndOrigin(uint64_t Size, Align InstAlignment);
 
   /// Returns a chain at the current stack with previous origin V.
   Value *updateOrigin(Value *V, IRBuilder<> &IRB);
 
+  /// Returns a chain at the current stack with previous origin V if Shadow is
+  /// tainted.
+  Value *updateOriginIfTainted(Value *Shadow, Value *Origin, IRBuilder<> &IRB);
+
   /// Creates an Intptr = Origin | Origin << 32 if Intptr's size is 64. Returns
   /// Origin otherwise.
   Value *originToIntptr(IRBuilder<> &IRB, Value *Origin);
@@ -722,6 +741,13 @@ struct DFSanFunction {
 
   bool shouldInstrumentWithCall();
 
+  /// Generates IR to load shadow and origin corresponding to bytes [\p
+  /// Addr, \p Addr + \p Size), where addr has alignment \p
+  /// InstAlignment, and take the union of each of those shadows. The returned
+  /// shadow always has primitive type.
+  std::pair<Value *, Value *>
+  loadShadowOriginSansLoadTracking(Value *Addr, uint64_t Size,
+                                   Align InstAlignment, Instruction *Pos);
   int NumOriginStores = 0;
 };
 
@@ -1110,6 +1136,9 @@ bool DataFlowSanitizer::init(Module &M) {
                         /*isVarArg=*/false);
   DFSanChainOriginFnTy =
       FunctionType::get(OriginTy, OriginTy, /*isVarArg=*/false);
+  Type *DFSanChainOriginIfTaintedArgs[2] = {PrimitiveShadowTy, OriginTy};
+  DFSanChainOriginIfTaintedFnTy = FunctionType::get(
+      OriginTy, DFSanChainOriginIfTaintedArgs, /*isVarArg=*/false);
   Type *DFSanMaybeStoreOriginArgs[4] = {IntegerType::get(*Ctx, ShadowWidthBits),
                                         Int8Ptr, IntptrTy, OriginTy};
   DFSanMaybeStoreOriginFnTy = FunctionType::get(
@@ -1343,6 +1372,15 @@ void DataFlowSanitizer::initializeRuntimeFunctions(Module &M) {
     DFSanChainOriginFn = Mod->getOrInsertFunction("__dfsan_chain_origin",
                                                   DFSanChainOriginFnTy, AL);
   }
+  {
+    AttributeList AL;
+    AL = AL.addParamAttribute(M.getContext(), 0, Attribute::ZExt);
+    AL = AL.addParamAttribute(M.getContext(), 1, Attribute::ZExt);
+    AL = AL.addAttribute(M.getContext(), AttributeList::ReturnIndex,
+                         Attribute::ZExt);
+    DFSanChainOriginIfTaintedFn = Mod->getOrInsertFunction(
+        "__dfsan_chain_origin_if_tainted", DFSanChainOriginIfTaintedFnTy, AL);
+  }
   DFSanMemOriginTransferFn = Mod->getOrInsertFunction(
       "__dfsan_mem_origin_transfer", DFSanMemOriginTransferFnTy);
 
@@ -1381,6 +1419,8 @@ void DataFlowSanitizer::initializeRuntimeFunctions(Module &M) {
       DFSanCmpCallbackFn.getCallee()->stripPointerCasts());
   DFSanRuntimeFunctions.insert(
       DFSanChainOriginFn.getCallee()->stripPointerCasts());
+  DFSanRuntimeFunctions.insert(
+      DFSanChainOriginIfTaintedFn.getCallee()->stripPointerCasts());
   DFSanRuntimeFunctions.insert(
       DFSanMemOriginTransferFn.getCallee()->stripPointerCasts());
   DFSanRuntimeFunctions.insert(
@@ -2033,6 +2073,11 @@ Align DFSanFunction::getOriginAlign(Align InstAlignment) {
 
 bool DFSanFunction::useCallbackLoadLabelAndOrigin(uint64_t Size,
                                                   Align InstAlignment) {
+  // When enabling tracking load instructions, we always use
+  // __dfsan_load_label_and_origin to reduce code size.
+  if (ClTrackOrigins == 2)
+    return true;
+
   assert(Size != 0);
   // * if Size == 1, it is sufficient to load its origin aligned at 4.
   // * if Size == 2, we assume most cases Addr % 2 == 0, so it is sufficient to
@@ -2198,13 +2243,8 @@ Value *DFSanFunction::loadLegacyShadowFast(Value *ShadowAddr, uint64_t Size,
   return Shadow;
 }
 
-// Generates IR to load shadow corresponding to bytes [Addr, Addr+Size), where
-// Addr has alignment Align, and take the union of each of those shadows. The
-// returned shadow always has primitive type.
-std::pair<Value *, Value *> DFSanFunction::loadShadowOrigin(Value *Addr,
-                                                            uint64_t Size,
-                                                            Align InstAlignment,
-                                                            Instruction *Pos) {
+std::pair<Value *, Value *> DFSanFunction::loadShadowOriginSansLoadTracking(
+    Value *Addr, uint64_t Size, Align InstAlignment, Instruction *Pos) {
   const bool ShouldTrackOrigins = DFS.shouldTrackOrigins();
 
   // Non-escaped loads.
@@ -2309,6 +2349,24 @@ std::pair<Value *, Value *> DFSanFunction::loadShadowOrigin(Value *Addr,
   return {FallbackCall, Origin};
 }
 
+std::pair<Value *, Value *> DFSanFunction::loadShadowOrigin(Value *Addr,
+                                                            uint64_t Size,
+                                                            Align InstAlignment,
+                                                            Instruction *Pos) {
+  Value *PrimitiveShadow, *Origin;
+  std::tie(PrimitiveShadow, Origin) =
+      loadShadowOriginSansLoadTracking(Addr, Size, InstAlignment, Pos);
+  if (DFS.shouldTrackOrigins()) {
+    if (ClTrackOrigins == 2) {
+      IRBuilder<> IRB(Pos);
+      auto *ConstantShadow = dyn_cast<Constant>(PrimitiveShadow);
+      if (!ConstantShadow || !ConstantShadow->isZeroValue())
+        Origin = updateOriginIfTainted(PrimitiveShadow, Origin, IRB);
+    }
+  }
+  return {PrimitiveShadow, Origin};
+}
+
 static AtomicOrdering addAcquireOrdering(AtomicOrdering AO) {
   switch (AO) {
   case AtomicOrdering::NotAtomic:
@@ -2380,6 +2438,12 @@ void DFSanVisitor::visitLoadInst(LoadInst &LI) {
   }
 }
 
+Value *DFSanFunction::updateOriginIfTainted(Value *Shadow, Value *Origin,
+                                            IRBuilder<> &IRB) {
+  assert(DFS.shouldTrackOrigins());
+  return IRB.CreateCall(DFS.DFSanChainOriginIfTaintedFn, {Shadow, Origin});
+}
+
 Value *DFSanFunction::updateOrigin(Value *V, IRBuilder<> &IRB) {
   if (!DFS.shouldTrackOrigins())
     return V;

diff  --git a/llvm/test/Instrumentation/DataFlowSanitizer/basic.ll b/llvm/test/Instrumentation/DataFlowSanitizer/basic.ll
index 87395a88ecb30..0fa1569617b2f 100644
--- a/llvm/test/Instrumentation/DataFlowSanitizer/basic.ll
+++ b/llvm/test/Instrumentation/DataFlowSanitizer/basic.ll
@@ -55,5 +55,6 @@ define void @store(i8* %p) {
 ; CHECK: declare void @__dfsan_nonzero_label()
 ; CHECK: declare void @__dfsan_vararg_wrapper(i8*)
 ; CHECK: declare zeroext i32 @__dfsan_chain_origin(i32 zeroext)
+; CHECK: declare zeroext i32 @__dfsan_chain_origin_if_tainted(i[[#SBITS]] zeroext, i32 zeroext)
 ; CHECK: declare void @__dfsan_mem_origin_transfer(i8*, i8*, i64)
 ; CHECK: declare void @__dfsan_maybe_store_origin(i[[#SBITS]] zeroext, i8*, i64, i32 zeroext)

diff  --git a/llvm/test/Instrumentation/DataFlowSanitizer/origin_track_load.ll b/llvm/test/Instrumentation/DataFlowSanitizer/origin_track_load.ll
new file mode 100644
index 0000000000000..f16a96aa76cbd
--- /dev/null
+++ b/llvm/test/Instrumentation/DataFlowSanitizer/origin_track_load.ll
@@ -0,0 +1,32 @@
+; RUN: opt < %s -dfsan -dfsan-track-origins=2 -dfsan-fast-8-labels -S | FileCheck %s
+; RUN: opt < %s -dfsan -dfsan-track-origins=2 -dfsan-fast-16-labels -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK: @__dfsan_shadow_width_bits = weak_odr constant i32 [[#SBITS:]]
+; CHECK: @__dfsan_shadow_width_bytes = weak_odr constant i32 [[#SBYTES:]]
+
+define i64 @load64(i64* %p) {
+  ; CHECK-LABEL: @"dfs$load64"
+
+  ; CHECK-NEXT: %[[#PO:]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 0), align 4
+  ; CHECK-NEXT: %[[#PS:]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN:2]]
+
+  ; CHECK-NEXT: %[[#INTP:]] = bitcast i64* %p to i8*
+  ; CHECK-NEXT: %[[#LABEL_ORIGIN:]] = call zeroext i64 @__dfsan_load_label_and_origin(i8* %[[#INTP]], i64 8)
+  ; CHECK-NEXT: %[[#LABEL_ORIGIN_H32:]] = lshr i64 %[[#LABEL_ORIGIN]], 32
+  ; CHECK-NEXT: %[[#LABEL:]] = trunc i64 %[[#LABEL_ORIGIN_H32]] to i[[#SBITS]]
+  ; CHECK-NEXT: %[[#ORIGIN:]] = trunc i64 %[[#LABEL_ORIGIN]] to i32
+  ; CHECK-NEXT: %[[#ORIGIN_CHAINED:]] = call i32 @__dfsan_chain_origin_if_tainted(i[[#SBITS]] %[[#LABEL]], i32 %[[#ORIGIN]])
+
+  ; CHECK-NEXT: %[[#LABEL:]] = or i[[#SBITS]] %[[#LABEL]], %[[#PS]]
+  ; CHECK-NEXT: %[[#NZ:]] = icmp ne i[[#SBITS]] %[[#PS]], 0
+  ; CHECK-NEXT: %[[#ORIGIN_SEL:]] = select i1 %[[#NZ]], i32 %[[#PO]], i32 %[[#ORIGIN_CHAINED]]
+
+  ; CHECK-NEXT: %a = load i64, i64* %p
+  ; CHECK-NEXT: store i[[#SBITS]] %[[#LABEL]], i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[ALIGN]]
+  ; CHECK-NEXT: store i32 %[[#ORIGIN_SEL]], i32* @__dfsan_retval_origin_tls, align 4
+
+  %a = load i64, i64* %p
+  ret i64 %a
+}


        


More information about the llvm-commits mailing list