[llvm] 79b5280 - [dfsan] Enable origin tracking with fast8 mode

George Balatsouras via llvm-commits llvm-commits at lists.llvm.org
Tue Apr 20 18:11:05 PDT 2021


Author: George Balatsouras
Date: 2021-04-20T18:10:32-07:00
New Revision: 79b5280a6c136ca98eb003a493545070e83db6bf

URL: https://github.com/llvm/llvm-project/commit/79b5280a6c136ca98eb003a493545070e83db6bf
DIFF: https://github.com/llvm/llvm-project/commit/79b5280a6c136ca98eb003a493545070e83db6bf.diff

LOG: [dfsan] Enable origin tracking with fast8 mode

All related instrumentation tests have been updated.

Reviewed By: stephan.yichao.zhao

Differential Revision: https://reviews.llvm.org/D100903

Added: 
    llvm/test/Instrumentation/DataFlowSanitizer/origin_load.ll
    llvm/test/Instrumentation/DataFlowSanitizer/origin_store.ll

Modified: 
    llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
    llvm/test/Instrumentation/DataFlowSanitizer/atomics.ll
    llvm/test/Instrumentation/DataFlowSanitizer/basic.ll
    llvm/test/Instrumentation/DataFlowSanitizer/origin_abilist.ll
    llvm/test/Instrumentation/DataFlowSanitizer/origin_cached_shadows.ll
    llvm/test/Instrumentation/DataFlowSanitizer/origin_call.ll
    llvm/test/Instrumentation/DataFlowSanitizer/origin_mem_intrinsic.ll
    llvm/test/Instrumentation/DataFlowSanitizer/origin_other_ops.ll
    llvm/test/Instrumentation/DataFlowSanitizer/origin_phi.ll
    llvm/test/Instrumentation/DataFlowSanitizer/origin_select.ll
    llvm/test/Instrumentation/DataFlowSanitizer/origin_store_threshold.ll

Removed: 
    llvm/test/Instrumentation/DataFlowSanitizer/origin_ldst.ll


################################################################################
diff  --git a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
index 63b8db7916a03..35315fe79fc1f 100644
--- a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -505,6 +505,10 @@ class DataFlowSanitizer {
   /// Returns whether fast8 or fast16 mode has been specified.
   bool hasFastLabelsEnabled();
 
+  /// Returns whether the given load byte size is amenable to inlined
+  /// optimization patterns.
+  bool hasLoadSizeForFastPath(uint64_t Size);
+
   /// Returns whether the pass tracks origins. Support only fast16 mode in TLS
   /// ABI mode.
   bool shouldTrackOrigins();
@@ -883,10 +887,15 @@ bool DataFlowSanitizer::hasFastLabelsEnabled() {
   return HasFastLabelsEnabled;
 }
 
+bool DataFlowSanitizer::hasLoadSizeForFastPath(uint64_t Size) {
+  uint64_t ShadowSize = Size * ShadowWidthBytes;
+  return ShadowSize % 8 == 0 || ShadowSize == 4;
+}
+
 bool DataFlowSanitizer::shouldTrackOrigins() {
   static const bool ShouldTrackOrigins =
       ClTrackOrigins && getInstrumentedABI() == DataFlowSanitizer::IA_TLS &&
-      ClFast16Labels;
+      hasFastLabelsEnabled();
   return ShouldTrackOrigins;
 }
 
@@ -2037,11 +2046,7 @@ bool DFSanFunction::useCallbackLoadLabelAndOrigin(uint64_t Size,
     return false;
 
   const Align Alignment = llvm::assumeAligned(InstAlignment.value());
-  if (Alignment >= MinOriginAlignment &&
-      Size % (64 / DFS.ShadowWidthBits) == 0)
-    return false;
-
-  return true;
+  return Alignment < MinOriginAlignment || !DFS.hasLoadSizeForFastPath(Size);
 }
 
 std::pair<Value *, Value *> DFSanFunction::loadFast16ShadowFast(
@@ -2284,8 +2289,7 @@ std::pair<Value *, Value *> DFSanFunction::loadShadowOrigin(Value *Addr,
     return {combineShadows(Load, Load1, Pos), Origin};
   }
   }
-  uint64_t ShadowSize = Size * DFS.ShadowWidthBytes;
-  bool HasSizeForFastPath = ShadowSize % 8 == 0 || ShadowSize == 4;
+  bool HasSizeForFastPath = DFS.hasLoadSizeForFastPath(Size);
   bool HasFastLabelsEnabled = DFS.hasFastLabelsEnabled();
 
   if (HasFastLabelsEnabled && HasSizeForFastPath)

diff  --git a/llvm/test/Instrumentation/DataFlowSanitizer/atomics.ll b/llvm/test/Instrumentation/DataFlowSanitizer/atomics.ll
index 0075170410fe2..31b3f5cc5efd1 100644
--- a/llvm/test/Instrumentation/DataFlowSanitizer/atomics.ll
+++ b/llvm/test/Instrumentation/DataFlowSanitizer/atomics.ll
@@ -1,5 +1,7 @@
+; RUN: opt < %s -dfsan -dfsan-fast-8-labels=true -S | FileCheck %s
 ; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -S | FileCheck %s --check-prefixes=CHECK,CHECK16
-; RUN: opt < %s -dfsan -dfsan-fast-8-labels=true -S | FileCheck %s --check-prefixes=CHECK
+; RUN: opt < %s -dfsan -dfsan-track-origins=1 -dfsan-fast-8-labels=true -S | FileCheck %s --check-prefixes=CHECK,CHECK_ORIGIN
+; RUN: opt < %s -dfsan -dfsan-track-origins=1 -dfsan-fast-8-labels=true -dfsan-instrument-with-call-threshold=0 -S | FileCheck %s --check-prefixes=CHECK,CHECK_ORIGIN
 ; RUN: opt < %s -dfsan -dfsan-track-origins=1 -dfsan-fast-16-labels=true -S | FileCheck %s --check-prefixes=CHECK,CHECK16,CHECK_ORIGIN
 ; RUN: opt < %s -dfsan -dfsan-track-origins=1 -dfsan-fast-16-labels=true -dfsan-instrument-with-call-threshold=0 -S | FileCheck %s --check-prefixes=CHECK,CHECK16,CHECK_ORIGIN
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"

diff  --git a/llvm/test/Instrumentation/DataFlowSanitizer/basic.ll b/llvm/test/Instrumentation/DataFlowSanitizer/basic.ll
index e45a8b4fc3da7..87395a88ecb30 100644
--- a/llvm/test/Instrumentation/DataFlowSanitizer/basic.ll
+++ b/llvm/test/Instrumentation/DataFlowSanitizer/basic.ll
@@ -1,6 +1,6 @@
 ; RUN: opt < %s -dfsan -S | FileCheck %s --check-prefixes=CHECK,CHECK_NO_ORIGIN -DSHADOW_MASK=-123145302310913
 ; RUN: opt < %s -dfsan -dfsan-track-origins=1 -dfsan-fast-16-labels=true -S | FileCheck %s --check-prefixes=CHECK,CHECK_ORIGIN -DSHADOW_MASK=-123145302310913
-; RUN: opt < %s -dfsan -dfsan-track-origins=1 -dfsan-fast-8-labels=true  -S | FileCheck %s --check-prefixes=CHECK,CHECK_NO_ORIGIN -DSHADOW_MASK=-105553116266497
+; RUN: opt < %s -dfsan -dfsan-track-origins=1 -dfsan-fast-8-labels=true  -S | FileCheck %s --check-prefixes=CHECK,CHECK_ORIGIN -DSHADOW_MASK=-105553116266497
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 

diff  --git a/llvm/test/Instrumentation/DataFlowSanitizer/origin_abilist.ll b/llvm/test/Instrumentation/DataFlowSanitizer/origin_abilist.ll
index dc6249fea5ec1..b6d8973b88343 100644
--- a/llvm/test/Instrumentation/DataFlowSanitizer/origin_abilist.ll
+++ b/llvm/test/Instrumentation/DataFlowSanitizer/origin_abilist.ll
@@ -1,7 +1,12 @@
+; RUN: opt < %s -dfsan -dfsan-track-origins=1 -dfsan-fast-8-labels=true  -dfsan-abilist=%S/Inputs/abilist.txt -S | FileCheck %s
 ; RUN: opt < %s -dfsan -dfsan-track-origins=1 -dfsan-fast-16-labels=true -dfsan-abilist=%S/Inputs/abilist.txt -S | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
+; CHECK: @__dfsan_arg_tls = external thread_local(initialexec) global [[TLS_ARR:\[100 x i64\]]]
+; CHECK: @__dfsan_shadow_width_bits = weak_odr constant i32 [[#SBITS:]]
+; CHECK: @__dfsan_shadow_width_bytes = weak_odr constant i32 [[#SBYTES:]]
+
 define i32 @discard(i32 %a, i32 %b) {
   ret i32 0
 }
@@ -113,9 +118,9 @@ define void @call_custom_without_ret(i32 %a, i32 %b) {
   ; CHECK: @"dfs$call_custom_without_ret"
   ; CHECK: [[BO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 1), align 4
   ; CHECK: [[AO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 0), align 4
-  ; CHECK: [[BS:%.*]] = load i16, i16* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 2) to i16*), align 2
-  ; CHECK: [[AS:%.*]] = load i16, i16* bitcast ([100 x i64]* @__dfsan_arg_tls to i16*), align 2
-  ; CHECK: call void @__dfso_custom_without_ret(i32 %a, i32 %b, i16 zeroext [[AS]], i16 zeroext [[BS]], i32 zeroext [[AO]], i32 zeroext [[BO]])
+  ; CHECK: [[BS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 2) to i[[#SBITS]]*), align 2
+  ; CHECK: [[AS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align 2
+  ; CHECK: call void @__dfso_custom_without_ret(i32 %a, i32 %b, i[[#SBITS]] zeroext [[AS]], i[[#SBITS]] zeroext [[BS]], i32 zeroext [[AO]], i32 zeroext [[BO]])
   ; CHECK-NEXT: ret void
 
   call void @custom_without_ret(i32 %a, i32 %b)
@@ -127,13 +132,13 @@ define i32 @call_custom_with_ret(i32 %a, i32 %b) {
   ; CHECK: %originreturn = alloca i32, align 4
   ; CHECK: [[BO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 1), align 4
   ; CHECK: [[AO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 0), align 4
-  ; CHECK: %labelreturn = alloca i16, align 2
-  ; CHECK: [[BS:%.*]] = load i16, i16* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 2) to i16*), align 2
-  ; CHECK: [[AS:%.*]] = load i16, i16* bitcast ([100 x i64]* @__dfsan_arg_tls to i16*), align 2
-  ; CHECK: {{.*}} = call i32 @__dfso_custom_with_ret(i32 %a, i32 %b, i16 zeroext [[AS]], i16 zeroext [[BS]], i16* %labelreturn, i32 zeroext [[AO]], i32 zeroext [[BO]], i32* %originreturn)
-  ; CHECK: [[RS:%.*]] = load i16, i16* %labelreturn, align 2
+  ; CHECK: %labelreturn = alloca i[[#SBITS]], align [[#SBYTES]]
+  ; CHECK: [[BS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 2) to i[[#SBITS]]*), align 2
+  ; CHECK: [[AS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align 2
+  ; CHECK: {{.*}} = call i32 @__dfso_custom_with_ret(i32 %a, i32 %b, i[[#SBITS]] zeroext [[AS]], i[[#SBITS]] zeroext [[BS]], i[[#SBITS]]* %labelreturn, i32 zeroext [[AO]], i32 zeroext [[BO]], i32* %originreturn)
+  ; CHECK: [[RS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* %labelreturn, align [[#SBYTES]]
   ; CHECK: [[RO:%.*]] = load i32, i32* %originreturn, align 4
-  ; CHECK: store i16 [[RS]], i16* bitcast ([100 x i64]* @__dfsan_retval_tls to i16*), align 2
+  ; CHECK: store i[[#SBITS]] [[RS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to i[[#SBITS]]*), align 2
   ; CHECK: store i32 [[RO]], i32* @__dfsan_retval_origin_tls, align 4
 
   %r = call i32 @custom_with_ret(i32 %a, i32 %b)
@@ -145,16 +150,16 @@ define void @call_custom_varg_without_ret(i32 %a, i32 %b) {
   ; CHECK: %originva = alloca [1 x i32], align 4
   ; CHECK: [[BO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 1), align 4
   ; CHECK: [[AO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 0), align 4
-  ; CHECK: %labelva = alloca [1 x i16], align 2
-  ; CHECK: [[BS:%.*]] = load i16, i16* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 2) to i16*), align 2
-  ; CHECK: [[AS:%.*]] = load i16, i16* bitcast ([100 x i64]* @__dfsan_arg_tls to i16*), align 2
-  ; CHECK: [[VS0:%.*]] = getelementptr inbounds [1 x i16], [1 x i16]* %labelva, i32 0, i32 0
-  ; CHECK: store i16 [[AS]], i16* [[VS0]], align 2
-  ; CHECK: [[VS0:%.*]] = getelementptr inbounds [1 x i16], [1 x i16]* %labelva, i32 0, i32 0
+  ; CHECK: %labelva = alloca [1 x i[[#SBITS]]], align [[#SBYTES]]
+  ; CHECK: [[BS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 2) to i[[#SBITS]]*), align 2
+  ; CHECK: [[AS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align 2
+  ; CHECK: [[VS0:%.*]] = getelementptr inbounds [1 x i[[#SBITS]]], [1 x i[[#SBITS]]]* %labelva, i32 0, i32 0
+  ; CHECK: store i[[#SBITS]] [[AS]], i[[#SBITS]]* [[VS0]], align [[#SBYTES]]
+  ; CHECK: [[VS0:%.*]] = getelementptr inbounds [1 x i[[#SBITS]]], [1 x i[[#SBITS]]]* %labelva, i32 0, i32 0
   ; CHECK: [[VO0:%.*]] = getelementptr inbounds [1 x i32], [1 x i32]* %originva, i32 0, i32 0
   ; CHECK: store i32 [[AO]], i32* [[VO0]], align 4
   ; CHECK: [[VO0:%.*]] = getelementptr inbounds [1 x i32], [1 x i32]* %originva, i32 0, i32 0
-  ; CHECK: call void (i32, i32, i16, i16, i16*, i32, i32, i32*, ...) @__dfso_custom_varg_without_ret(i32 %a, i32 %b, i16 zeroext [[AS]], i16 zeroext [[BS]], i16* [[VS0]], i32 zeroext [[AO]], i32 zeroext [[BO]], i32* [[VO0]], i32 %a)
+  ; CHECK: call void (i32, i32, i[[#SBITS]], i[[#SBITS]], i[[#SBITS]]*, i32, i32, i32*, ...) @__dfso_custom_varg_without_ret(i32 %a, i32 %b, i[[#SBITS]] zeroext [[AS]], i[[#SBITS]] zeroext [[BS]], i[[#SBITS]]* [[VS0]], i32 zeroext [[AO]], i32 zeroext [[BO]], i32* [[VO0]], i32 %a)
   ; CHECK-NEXT: ret void
 
   call void (i32, i32, ...) @custom_varg_without_ret(i32 %a, i32 %b, i32 %a)
@@ -167,20 +172,20 @@ define i32 @call_custom_varg_with_ret(i32 %a, i32 %b) {
   ; CHECK: %originva = alloca [1 x i32], align 4
   ; CHECK: [[BO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 1), align 4
   ; CHECK: [[AO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 0), align 4
-  ; CHECK: %labelreturn = alloca i16, align 2
-  ; CHECK: %labelva = alloca [1 x i16], align 2
-  ; CHECK: [[BS:%.*]] = load i16, i16* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 2) to i16*), align 2
-  ; CHECK: [[AS:%.*]] = load i16, i16* bitcast ([100 x i64]* @__dfsan_arg_tls to i16*), align 2
-  ; CHECK: [[VS0:%.*]] = getelementptr inbounds [1 x i16], [1 x i16]* %labelva, i32 0, i32 0
-  ; CHECK: store i16 [[BS]], i16* [[VS0]], align 2
-  ; CHECK: [[VS0:%.*]] = getelementptr inbounds [1 x i16], [1 x i16]* %labelva, i32 0, i32 0
+  ; CHECK: %labelreturn = alloca i[[#SBITS]], align [[#SBYTES]]
+  ; CHECK: %labelva = alloca [1 x i[[#SBITS]]], align [[#SBYTES]]
+  ; CHECK: [[BS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 2) to i[[#SBITS]]*), align 2
+  ; CHECK: [[AS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align 2
+  ; CHECK: [[VS0:%.*]] = getelementptr inbounds [1 x i[[#SBITS]]], [1 x i[[#SBITS]]]* %labelva, i32 0, i32 0
+  ; CHECK: store i[[#SBITS]] [[BS]], i[[#SBITS]]* [[VS0]], align [[#SBYTES]]
+  ; CHECK: [[VS0:%.*]] = getelementptr inbounds [1 x i[[#SBITS]]], [1 x i[[#SBITS]]]* %labelva, i32 0, i32 0
   ; CHECK: [[VO0:%.*]] = getelementptr inbounds [1 x i32], [1 x i32]* %originva, i32 0, i32 0
   ; CHECK: store i32 [[BO]], i32* [[VO0]], align 4
   ; CHECK: [[VO0:%.*]] = getelementptr inbounds [1 x i32], [1 x i32]* %originva, i32 0, i32 0
-  ; CHECK: {{.*}} = call i32 (i32, i32, i16, i16, i16*, i16*, i32, i32, i32*, i32*, ...) @__dfso_custom_varg_with_ret(i32 %a, i32 %b, i16 zeroext [[AS]], i16 zeroext [[BS]], i16* [[VS0]], i16* %labelreturn, i32 zeroext [[AO]], i32 zeroext [[BO]], i32* [[VO0]], i32* %originreturn, i32 %b)
-  ; CHECK: [[RS:%.*]] = load i16, i16* %labelreturn, align 2
+  ; CHECK: {{.*}} = call i32 (i32, i32, i[[#SBITS]], i[[#SBITS]], i[[#SBITS]]*, i[[#SBITS]]*, i32, i32, i32*, i32*, ...) @__dfso_custom_varg_with_ret(i32 %a, i32 %b, i[[#SBITS]] zeroext [[AS]], i[[#SBITS]] zeroext [[BS]], i[[#SBITS]]* [[VS0]], i[[#SBITS]]* %labelreturn, i32 zeroext [[AO]], i32 zeroext [[BO]], i32* [[VO0]], i32* %originreturn, i32 %b)
+  ; CHECK: [[RS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* %labelreturn, align [[#SBYTES]]
   ; CHECK: [[RO:%.*]] = load i32, i32* %originreturn, align 4
-  ; CHECK: store i16 [[RS]], i16* bitcast ([100 x i64]* @__dfsan_retval_tls to i16*), align 2
+  ; CHECK: store i[[#SBITS]] [[RS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to i[[#SBITS]]*), align 2
   ; CHECK: store i32 [[RO]], i32* @__dfsan_retval_origin_tls, align 4
 
   %r = call i32 (i32, i32, ...) @custom_varg_with_ret(i32 %a, i32 %b, i32 %b)
@@ -192,13 +197,13 @@ define i32 @call_custom_cb_with_ret(i32 %a, i32 %b) {
   ; CHECK: %originreturn = alloca i32, align 4
   ; CHECK: [[BO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 1), align 4
   ; CHECK: [[AO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 0), align 4
-  ; CHECK: %labelreturn = alloca i16, align 2
-  ; CHECK: [[BS:%.*]] = load i16, i16* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 2) to i16*), align 2
-  ; CHECK: [[AS:%.*]] = load i16, i16* bitcast ([100 x i64]* @__dfsan_arg_tls to i16*), align 2
-  ; CHECK: {{.*}} = call i32 @__dfso_custom_cb_with_ret(i32 (i32 (i32, i32)*, i32, i32, i16, i16, i16*, i32, i32, i32*)* @"dfst0$custom_cb_with_ret", i8* bitcast (i32 (i32, i32)* @"dfs$cb_with_ret" to i8*), i32 %a, i32 %b, i16 zeroext 0, i16 zeroext [[AS]], i16 zeroext [[BS]], i16* %labelreturn, i32 zeroext 0, i32 zeroext [[AO]], i32 zeroext [[BO]], i32* %originreturn)
-  ; CHECK: [[RS:%.*]] = load i16, i16* %labelreturn, align 2
+  ; CHECK: %labelreturn = alloca i[[#SBITS]], align [[#SBYTES]]
+  ; CHECK: [[BS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 2) to i[[#SBITS]]*), align 2
+  ; CHECK: [[AS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align 2
+  ; CHECK: {{.*}} = call i32 @__dfso_custom_cb_with_ret(i32 (i32 (i32, i32)*, i32, i32, i[[#SBITS]], i[[#SBITS]], i[[#SBITS]]*, i32, i32, i32*)* @"dfst0$custom_cb_with_ret", i8* bitcast (i32 (i32, i32)* @"dfs$cb_with_ret" to i8*), i32 %a, i32 %b, i[[#SBITS]] zeroext 0, i[[#SBITS]] zeroext [[AS]], i[[#SBITS]] zeroext [[BS]], i[[#SBITS]]* %labelreturn, i32 zeroext 0, i32 zeroext [[AO]], i32 zeroext [[BO]], i32* %originreturn)
+  ; CHECK: [[RS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* %labelreturn, align [[#SBYTES]]
   ; CHECK: [[RO:%.*]] = load i32, i32* %originreturn, align 4
-  ; CHECK: store i16 [[RS]], i16* bitcast ([100 x i64]* @__dfsan_retval_tls to i16*), align 2
+  ; CHECK: store i[[#SBITS]] [[RS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to i[[#SBITS]]*), align 2
   ; CHECK: store i32 [[RO]], i32* @__dfsan_retval_origin_tls, align 4
 
   %r = call i32 @custom_cb_with_ret(i32 (i32, i32)* @cb_with_ret, i32 %a, i32 %b)
@@ -209,9 +214,9 @@ define void @call_custom_cb_without_ret(i32 %a, i32 %b) {
   ; CHECK: @"dfs$call_custom_cb_without_ret"
   ; CHECK: [[BO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 1), align 4
   ; CHECK: [[AO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 0), align 4
-  ; CHECK: [[BS:%.*]] = load i16, i16* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 2) to i16*), align 2
-  ; CHECK: [[AS:%.*]] = load i16, i16* bitcast ([100 x i64]* @__dfsan_arg_tls to i16*), align 2
-  ; CHECK: call void @__dfso_custom_cb_without_ret(void (void (i32, i32)*, i32, i32, i16, i16, i32, i32)* @"dfst0$custom_cb_without_ret", i8* bitcast (void (i32, i32)* @"dfs$cb_without_ret" to i8*), i32 %a, i32 %b, i16 zeroext 0, i16 zeroext [[AS]], i16 zeroext [[BS]], i32 zeroext 0, i32 zeroext [[AO]], i32 zeroext [[BO]])
+  ; CHECK: [[BS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 2) to i[[#SBITS]]*), align 2
+  ; CHECK: [[AS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align 2
+  ; CHECK: call void @__dfso_custom_cb_without_ret(void (void (i32, i32)*, i32, i32, i[[#SBITS]], i[[#SBITS]], i32, i32)* @"dfst0$custom_cb_without_ret", i8* bitcast (void (i32, i32)* @"dfs$cb_without_ret" to i8*), i32 %a, i32 %b, i[[#SBITS]] zeroext 0, i[[#SBITS]] zeroext [[AS]], i[[#SBITS]] zeroext [[BS]], i32 zeroext 0, i32 zeroext [[AO]], i32 zeroext [[BO]])
   ; CHECK-NEXT: ret void
 
   call void @custom_cb_without_ret(void (i32, i32)* @cb_without_ret, i32 %a, i32 %b)
@@ -220,29 +225,29 @@ define void @call_custom_cb_without_ret(i32 %a, i32 %b) {
 
 ; CHECK: define i32 @discardg(i32 %0, i32 %1)
 ; CHECK: [[R:%.*]] = call i32 @"dfs$g"
-; CHECK-NEXT: %_dfsret = load i16, i16* bitcast ([100 x i64]* @__dfsan_retval_tls to i16*), align 2
+; CHECK-NEXT: %_dfsret = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to i[[#SBITS]]*), align 2
 ; CHECK-NEXT: %_dfsret_o = load i32, i32* @__dfsan_retval_origin_tls, align 4
 ; CHECK-NEXT: ret i32 [[R]]
 
 ; CHECK: define linkonce_odr void @"dfso$custom_without_ret"(i32 %0, i32 %1)
 ; CHECK:  [[BO:%.*]]  = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 1), align 4
 ; CHECK-NEXT:  [[AO:%.*]]  = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 0), align 4
-; CHECK-NEXT:  [[BS:%.*]]  = load i16, i16* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 2) to i16*), align 2
-; CHECK-NEXT:  [[AS:%.*]]  = load i16, i16* bitcast ([100 x i64]* @__dfsan_arg_tls to i16*), align 2
-; CHECK-NEXT:  call void @__dfso_custom_without_ret(i32 %0, i32 %1, i16 zeroext [[AS]], i16 zeroext [[BS]], i32 zeroext [[AO]], i32 zeroext [[BO]])
+; CHECK-NEXT:  [[BS:%.*]]  = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 2) to i[[#SBITS]]*), align 2
+; CHECK-NEXT:  [[AS:%.*]]  = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align 2
+; CHECK-NEXT:  call void @__dfso_custom_without_ret(i32 %0, i32 %1, i[[#SBITS]] zeroext [[AS]], i[[#SBITS]] zeroext [[BS]], i32 zeroext [[AO]], i32 zeroext [[BO]])
 ; CHECK-NEXT:  ret void
 
 ; CHECK: define linkonce_odr i32 @"dfso$custom_with_ret"(i32 %0, i32 %1)
 ; CHECK:  %originreturn = alloca i32, align 4
 ; CHECK-NEXT:  [[BO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 1), align 4
 ; CHECK-NEXT:  [[AO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 0), align 4
-; CHECK-NEXT:  %labelreturn = alloca i16, align 2
-; CHECK-NEXT:  [[BS:%.*]] = load i16, i16* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 2) to i16*), align 2
-; CHECK-NEXT:  [[AS:%.*]] = load i16, i16* bitcast ([100 x i64]* @__dfsan_arg_tls to i16*), align 2
-; CHECK-NEXT:  [[R:%.*]] = call i32 @__dfso_custom_with_ret(i32 %0, i32 %1, i16 zeroext [[AS]], i16 zeroext [[BS]], i16* %labelreturn, i32 zeroext [[AO]], i32 zeroext [[BO]], i32* %originreturn)
-; CHECK-NEXT:  [[RS:%.*]] = load i16, i16* %labelreturn, align 2
+; CHECK-NEXT:  %labelreturn = alloca i[[#SBITS]], align [[#SBYTES]]
+; CHECK-NEXT:  [[BS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 2) to i[[#SBITS]]*), align 2
+; CHECK-NEXT:  [[AS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align 2
+; CHECK-NEXT:  [[R:%.*]] = call i32 @__dfso_custom_with_ret(i32 %0, i32 %1, i[[#SBITS]] zeroext [[AS]], i[[#SBITS]] zeroext [[BS]], i[[#SBITS]]* %labelreturn, i32 zeroext [[AO]], i32 zeroext [[BO]], i32* %originreturn)
+; CHECK-NEXT:  [[RS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* %labelreturn, align [[#SBYTES]]
 ; CHECK-NEXT:  [[RO:%.*]] = load i32, i32* %originreturn, align 4
-; CHECK-NEXT:  store i16 [[RS]], i16* bitcast ([100 x i64]* @__dfsan_retval_tls to i16*), align 2
+; CHECK-NEXT:  store i[[#SBITS]] [[RS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to i[[#SBITS]]*), align 2
 ; CHECK-NEXT:  store i32 [[RO]], i32* @__dfsan_retval_origin_tls, align 4
 ; CHECK-NEXT:  ret i32 [[R]]
 
@@ -259,15 +264,15 @@ define void @call_custom_cb_without_ret(i32 %a, i32 %b) {
 ; CHECK-NEXT:  [[BO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 2), align 4
 ; CHECK-NEXT:  [[AO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 1), align 4
 ; CHECK-NEXT:  [[CO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 0), align 4
-; CHECK-NEXT:  %labelreturn = alloca i16, align 2
-; CHECK-NEXT:  [[BS:%.*]] = load i16, i16* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 4) to i16*), align 2
-; CHECK-NEXT:  [[AS:%.*]] = load i16, i16* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 2) to i16*), align 2
-; CHECK-NEXT:  [[CS:%.*]] = load i16, i16* bitcast ([100 x i64]* @__dfsan_arg_tls to i16*), align 2
+; CHECK-NEXT:  %labelreturn = alloca i[[#SBITS]], align [[#SBYTES]]
+; CHECK-NEXT:  [[BS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 4) to i[[#SBITS]]*), align 2
+; CHECK-NEXT:  [[AS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 2) to i[[#SBITS]]*), align 2
+; CHECK-NEXT:  [[CS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align 2
 ; CHECK-NEXT:  [[C:%.*]] = bitcast i32 (i32, i32)* %0 to i8*
-; CHECK-NEXT:  [[R:%.*]] = call i32 @__dfso_custom_cb_with_ret(i32 (i32 (i32, i32)*, i32, i32, i16, i16, i16*, i32, i32, i32*)* @"dfst0$custom_cb_with_ret", i8* [[C]], i32 %1, i32 %2, i16 zeroext [[CS]], i16 zeroext [[AS]], i16 zeroext [[BS]], i16* %labelreturn, i32 zeroext [[CO]], i32 zeroext [[AO]], i32 zeroext [[BO]], i32* %originreturn)
-; CHECK-NEXT:  [[RS:%.*]] = load i16, i16* %labelreturn, align 2
+; CHECK-NEXT:  [[R:%.*]] = call i32 @__dfso_custom_cb_with_ret(i32 (i32 (i32, i32)*, i32, i32, i[[#SBITS]], i[[#SBITS]], i[[#SBITS]]*, i32, i32, i32*)* @"dfst0$custom_cb_with_ret", i8* [[C]], i32 %1, i32 %2, i[[#SBITS]] zeroext [[CS]], i[[#SBITS]] zeroext [[AS]], i[[#SBITS]] zeroext [[BS]], i[[#SBITS]]* %labelreturn, i32 zeroext [[CO]], i32 zeroext [[AO]], i32 zeroext [[BO]], i32* %originreturn)
+; CHECK-NEXT:  [[RS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* %labelreturn, align [[#SBYTES]]
 ; CHECK-NEXT:  [[RO:%.*]] = load i32, i32* %originreturn, align 4
-; CHECK-NEXT:  store i16 [[RS]], i16* bitcast ([100 x i64]* @__dfsan_retval_tls to i16*), align 2
+; CHECK-NEXT:  store i[[#SBITS]] [[RS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to i[[#SBITS]]*), align 2
 ; CHECK-NEXT:  store i32 [[RO]], i32* @__dfsan_retval_origin_tls, align 4
 ; CHECK-NEXT:  ret i32 [[R]]
 
@@ -275,41 +280,41 @@ define void @call_custom_cb_without_ret(i32 %a, i32 %b) {
 ; CHECK:   [[BO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 2), align 4
 ; CHECK-NEXT:  [[AO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 1), align 4
 ; CHECK-NEXT:  [[CO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 0), align 4
-; CHECK-NEXT:  [[BS:%.*]] = load i16, i16* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 4) to i16*), align 2
-; CHECK-NEXT:  [[AS:%.*]] = load i16, i16* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 2) to i16*), align 2
-; CHECK-NEXT:  [[CS:%.*]] = load i16, i16* bitcast ([100 x i64]* @__dfsan_arg_tls to i16*), align 2
+; CHECK-NEXT:  [[BS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 4) to i[[#SBITS]]*), align 2
+; CHECK-NEXT:  [[AS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 2) to i[[#SBITS]]*), align 2
+; CHECK-NEXT:  [[CS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align 2
 ; CHECK-NEXT:  [[C:%.*]] = bitcast void (i32, i32)* %0 to i8*
-; CHECK-NEXT:  call void @__dfso_custom_cb_without_ret(void (void (i32, i32)*, i32, i32, i16, i16, i32, i32)* @"dfst0$custom_cb_without_ret", i8* [[C]], i32 %1, i32 %2, i16 zeroext [[CS]], i16 zeroext [[AS]], i16 zeroext [[BS]], i32 zeroext [[CO]], i32 zeroext [[AO]], i32 zeroext [[BO]])
+; CHECK-NEXT:  call void @__dfso_custom_cb_without_ret(void (void (i32, i32)*, i32, i32, i[[#SBITS]], i[[#SBITS]], i32, i32)* @"dfst0$custom_cb_without_ret", i8* [[C]], i32 %1, i32 %2, i[[#SBITS]] zeroext [[CS]], i[[#SBITS]] zeroext [[AS]], i[[#SBITS]] zeroext [[BS]], i32 zeroext [[CO]], i32 zeroext [[AO]], i32 zeroext [[BO]])
 ; CHECK-NEXT:  ret void
 
-; CHECK: declare void @__dfso_custom_without_ret(i32, i32, i16, i16, i32, i32)
+; CHECK: declare void @__dfso_custom_without_ret(i32, i32, i[[#SBITS]], i[[#SBITS]], i32, i32)
 
-; CHECK: declare i32 @__dfso_custom_with_ret(i32, i32, i16, i16, i16*, i32, i32, i32*)
+; CHECK: declare i32 @__dfso_custom_with_ret(i32, i32, i[[#SBITS]], i[[#SBITS]], i[[#SBITS]]*, i32, i32, i32*)
 
-; CHECK: declare i32 @__dfso_custom_cb_with_ret(i32 (i32 (i32, i32)*, i32, i32, i16, i16, i16*, i32, i32, i32*)*, i8*, i32, i32, i16, i16, i16, i16*, i32, i32, i32, i32*)
+; CHECK: declare i32 @__dfso_custom_cb_with_ret(i32 (i32 (i32, i32)*, i32, i32, i[[#SBITS]], i[[#SBITS]], i[[#SBITS]]*, i32, i32, i32*)*, i8*, i32, i32, i[[#SBITS]], i[[#SBITS]], i[[#SBITS]], i[[#SBITS]]*, i32, i32, i32, i32*)
 
-; CHECK: define linkonce_odr i32 @"dfst0$custom_cb_with_ret"(i32 (i32, i32)* %0, i32 %1, i32 %2, i16 %3, i16 %4, i16* %5, i32 %6, i32 %7, i32* %8)
+; CHECK: define linkonce_odr i32 @"dfst0$custom_cb_with_ret"(i32 (i32, i32)* %0, i32 %1, i32 %2, i[[#SBITS]] %3, i[[#SBITS]] %4, i[[#SBITS]]* %5, i32 %6, i32 %7, i32* %8)
 ; CHECK:   store i32 %6, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 0), align 4
-; CHECK-NEXT:  store i16 %3, i16* bitcast ([100 x i64]* @__dfsan_arg_tls to i16*), align 2
+; CHECK-NEXT:  store i[[#SBITS]] %3, i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align 2
 ; CHECK-NEXT:  store i32 %7, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 1), align 4
-; CHECK-NEXT:  store i16 %4, i16* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 2) to i16*), align 2
+; CHECK-NEXT:  store i[[#SBITS]] %4, i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 2) to i[[#SBITS]]*), align 2
 ; CHECK-NEXT:  %9 = call i32 %0(i32 %1, i32 %2)
-; CHECK-NEXT:  %_dfsret = load i16, i16* bitcast ([100 x i64]* @__dfsan_retval_tls to i16*), align 2
+; CHECK-NEXT:  %_dfsret = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to i[[#SBITS]]*), align 2
 ; CHECK-NEXT:  %_dfsret_o = load i32, i32* @__dfsan_retval_origin_tls, align 4
-; CHECK-NEXT:  store i16 %_dfsret, i16* %5, align 2
+; CHECK-NEXT:  store i[[#SBITS]] %_dfsret, i[[#SBITS]]* %5, align [[#SBYTES]]
 ; CHECK-NEXT:  store i32 %_dfsret_o, i32* %8, align 4
 ; CHECK-NEXT:  ret i32 %9
 
-; CHECK: declare void @__dfso_custom_cb_without_ret(void (void (i32, i32)*, i32, i32, i16, i16, i32, i32)*, i8*, i32, i32, i16, i16, i16, i32, i32, i32)
+; CHECK: declare void @__dfso_custom_cb_without_ret(void (void (i32, i32)*, i32, i32, i[[#SBITS]], i[[#SBITS]], i32, i32)*, i8*, i32, i32, i[[#SBITS]], i[[#SBITS]], i[[#SBITS]], i32, i32, i32)
 
-; CHECK: define linkonce_odr void @"dfst0$custom_cb_without_ret"(void (i32, i32)* %0, i32 %1, i32 %2, i16 %3, i16 %4, i32 %5, i32 %6)
+; CHECK: define linkonce_odr void @"dfst0$custom_cb_without_ret"(void (i32, i32)* %0, i32 %1, i32 %2, i[[#SBITS]] %3, i[[#SBITS]] %4, i32 %5, i32 %6)
 ; CHECK:  store i32 %5, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 0), align 4
-; CHECK-NEXT:  store i16 %3, i16* bitcast ([100 x i64]* @__dfsan_arg_tls to i16*), align 2
+; CHECK-NEXT:  store i[[#SBITS]] %3, i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align 2
 ; CHECK-NEXT:  store i32 %6, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 1), align 4
-; CHECK-NEXT:  store i16 %4, i16* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 2) to i16*), align 2
+; CHECK-NEXT:  store i[[#SBITS]] %4, i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 2) to i[[#SBITS]]*), align 2
 ; CHECK-NEXT:  call void %0(i32 %1, i32 %2)
 ; CHECK-NEXT:  ret void
 
-; CHECK: declare void @__dfso_custom_varg_without_ret(i32, i32, i16, i16, i16*, i32, i32, i32*, ...)
+; CHECK: declare void @__dfso_custom_varg_without_ret(i32, i32, i[[#SBITS]], i[[#SBITS]], i[[#SBITS]]*, i32, i32, i32*, ...)
 
-; CHECK: declare i32 @__dfso_custom_varg_with_ret(i32, i32, i16, i16, i16*, i16*, i32, i32, i32*, i32*, ...)
\ No newline at end of file
+; CHECK: declare i32 @__dfso_custom_varg_with_ret(i32, i32, i[[#SBITS]], i[[#SBITS]], i[[#SBITS]]*, i[[#SBITS]]*, i32, i32, i32*, i32*, ...)
\ No newline at end of file

diff  --git a/llvm/test/Instrumentation/DataFlowSanitizer/origin_cached_shadows.ll b/llvm/test/Instrumentation/DataFlowSanitizer/origin_cached_shadows.ll
index a8cd6596ee5af..482959296fde3 100644
--- a/llvm/test/Instrumentation/DataFlowSanitizer/origin_cached_shadows.ll
+++ b/llvm/test/Instrumentation/DataFlowSanitizer/origin_cached_shadows.ll
@@ -1,4 +1,5 @@
-; RUN: opt < %s -dfsan -dfsan-track-origins=1 -dfsan-fast-16-labels=true -S | FileCheck %s --check-prefix=CHECK
+; RUN: opt < %s -dfsan -dfsan-track-origins=1 -dfsan-fast-8-labels=true  -S | FileCheck %s
+; RUN: opt < %s -dfsan -dfsan-track-origins=1 -dfsan-fast-16-labels=true -S | FileCheck %s
 ;
 ; %15 and %17 have the same key in shadow cache. They should not reuse the same
 ; shadow because their blocks do not dominate each other. Origin tracking
@@ -14,7 +15,7 @@ target triple = "x86_64-unknown-linux-gnu"
 define void @cached_shadows(double %0) {
   ; CHECK: @"dfs$cached_shadows"
   ; CHECK:  [[AO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 0), align 4
-  ; CHECK:  [[AS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[#SBYTES]]
+  ; CHECK:  [[AS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN:2]]
   ; CHECK: [[L1:[0-9]+]]:
   ; CHECK:  {{.*}} = phi i[[#SBITS]]
   ; CHECK:  {{.*}} = phi i32
@@ -39,7 +40,7 @@ define void @cached_shadows(double %0) {
   ; CHECK:  [[S_L6:%.*]] = or i[[#SBITS]]
   ; CHECK:  [[AS_NE_L6:%.*]] = icmp ne i[[#SBITS]] [[AS]], 0
   ; CHECK:  [[O_L6:%.*]] = select i1 [[AS_NE_L6]], i32 [[AO]], i32 [[O_L1]]
-  ; CHECK:  [[V_L6:%.*]] = fadd double %24, %0
+  ; CHECK:  [[V_L6:%.*]] = fadd double [[V_L1]], %0
   ; CHECK:  br label %[[L7]]
   ; CHECK: [[L7]]:
   ; CHECK:  [[S_L7]] = phi i[[#SBITS]] [ [[S_L3]], %[[L3]] ], [ [[S_L1]], %[[L2]] ], [ [[S_L6]], %[[L6]] ]

diff  --git a/llvm/test/Instrumentation/DataFlowSanitizer/origin_call.ll b/llvm/test/Instrumentation/DataFlowSanitizer/origin_call.ll
index 61a12e6533972..7f511bd207586 100644
--- a/llvm/test/Instrumentation/DataFlowSanitizer/origin_call.ll
+++ b/llvm/test/Instrumentation/DataFlowSanitizer/origin_call.ll
@@ -1,7 +1,12 @@
-; RUN: opt < %s -dfsan -dfsan-track-origins=1 -dfsan-fast-16-labels=true -S | FileCheck %s --check-prefix=CHECK
+; RUN: opt < %s -dfsan -dfsan-track-origins=1 -dfsan-fast-8-labels=true  -S | FileCheck %s
+; RUN: opt < %s -dfsan -dfsan-track-origins=1 -dfsan-fast-16-labels=true -S | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
+; CHECK: @__dfsan_arg_tls = external thread_local(initialexec) global [[TLS_ARR:\[100 x i64\]]]
+; CHECK: @__dfsan_shadow_width_bits = weak_odr constant i32 [[#SBITS:]]
+; CHECK: @__dfsan_shadow_width_bytes = weak_odr constant i32 [[#SBYTES:]]
+
 define i1 @arg_overflow(
 i1   %a0, i1   %a1, i1   %a2, i1   %a3, i1   %a4, i1   %a5, i1   %a6, i1   %a7, i1   %a8, i1   %a9,
 i1  %a10, i1  %a11, i1  %a12, i1  %a13, i1  %a14, i1  %a15, i1  %a16, i1  %a17, i1  %a18, i1  %a19,
@@ -36,8 +41,8 @@ i1 %a200
 define i1 @param_overflow(i1 %a) {
   ; CHECK: @"dfs$param_overflow"
   ; CHECK: store i32 %1, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 199), align 4
-  ; CHECK-NEXT: store i16 %2, i16* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 398) to i16*), align 2
-  ; CHECK-NEXT: store i16 %2, i16* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 400) to i16*), align 2
+  ; CHECK-NEXT: store i[[#SBITS]] %2, i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 398) to i[[#SBITS]]*), align 2
+  ; CHECK-NEXT: store i[[#SBITS]] %2, i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 400) to i[[#SBITS]]*), align 2
   ; CHECK-NEXT: %r = call i1 @"dfs$arg_overflow"
   ; CHECK: %_dfsret_o = load i32, i32* @__dfsan_retval_origin_tls, align 4
   ; CHECK: store i32 %_dfsret_o, i32* @__dfsan_retval_origin_tls, align 4
@@ -72,7 +77,7 @@ declare void @foo(i1 %a)
 
 define void @param_with_zero_shadow() {
   ; CHECK: @"dfs$param_with_zero_shadow"
-  ; CHECK-NEXT: store i16 0, i16* bitcast ([100 x i64]* @__dfsan_arg_tls to i16*), align 2
+  ; CHECK-NEXT: store i[[#SBITS]] 0, i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_arg_tls to i[[#SBITS]]*), align 2
   ; CHECK-NEXT: call void @"dfs$foo"(i1 true)
 
   call void @foo(i1 1)

diff  --git a/llvm/test/Instrumentation/DataFlowSanitizer/origin_ldst.ll b/llvm/test/Instrumentation/DataFlowSanitizer/origin_ldst.ll
deleted file mode 100644
index 7834bb008bc8b..0000000000000
--- a/llvm/test/Instrumentation/DataFlowSanitizer/origin_ldst.ll
+++ /dev/null
@@ -1,422 +0,0 @@
-; RUN: opt < %s -dfsan -dfsan-track-origins=1 -dfsan-fast-16-labels=true -S | FileCheck %s --check-prefixes=CHECK_META,CHECK
-; RUN: opt < %s -dfsan -dfsan-track-origins=1 -dfsan-fast-16-labels=true -dfsan-combine-pointer-labels-on-load=false -S | FileCheck %s --check-prefixes=CHECK_META,NO_COMBINE_LOAD_PTR
-; RUN: opt < %s -dfsan -dfsan-track-origins=1 -dfsan-fast-16-labels=true -dfsan-combine-pointer-labels-on-store=true -S | FileCheck %s --check-prefixes=CHECK_META,COMBINE_STORE_PTR
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-; CHECK_META: @__dfsan_shadow_width_bits = weak_odr constant i32 [[#SBITS:]]
-; CHECK_META: @__dfsan_shadow_width_bytes = weak_odr constant i32 [[#SBYTES:]]
-
-define {} @load0({}* %p) {
-  ; CHECK-LABEL: @"dfs$load0"
-  ; CHECK-NEXT: %a = load {}, {}* %p, align 1
-  ; CHECK-NEXT: store {} zeroinitializer, {}* bitcast ([100 x i64]* @__dfsan_retval_tls to {}*), align [[#SBYTES]]
-  ; CHECK-NEXT: store i32 0, i32* @__dfsan_retval_origin_tls, align 4
-  ; CHECK-NEXT: ret {} %a
-
-  %a = load {}, {}* %p
-  ret {} %a
-}
-
-define i16 @load_non_escaped_alloca() {
-  ; CHECK-LABEL: @"dfs$load_non_escaped_alloca"
-  ; CHECK-NEXT: [[S_ALLOCA:%.*]] = alloca i[[#SBITS]], align [[#SBYTES]]
-  ; CHECK-NEXT: [[O_ALLOCA:%.*]] = alloca i32, align 4
-  ; CHECK: [[SHADOW:%.*]] = load i[[#SBITS]], i[[#SBITS]]* [[S_ALLOCA]], align [[#SBYTES]]
-  ; CHECK-NEXT: [[ORIGIN:%.*]] = load i32, i32* [[O_ALLOCA]], align 4
-  ; CHECK-NEXT: %a = load i16, i16* %p, align 2
-  ; CHECK-NEXT: store i[[#SBITS]] [[SHADOW]], i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[#SBYTES]]
-  ; CHECK-NEXT: store i32 [[ORIGIN]], i32* @__dfsan_retval_origin_tls, align 4
-  
-  %p = alloca i16
-  %a = load i16, i16* %p
-  ret i16 %a
-}
-
-define i16* @load_escaped_alloca() {
-  ; CHECK-LABEL: @"dfs$load_escaped_alloca"
-  ; CHECK: [[INTP:%.*]] = ptrtoint i[[#SBITS]]* %p to i64
-  ; CHECK-NEXT: [[OFFSET:%.*]] = and i64 [[INTP]]
-  ; CHECK-NEXT: [[SHADOW_ADDR:%.*]] = mul i64 [[OFFSET]], 2
-  ; CHECK-NEXT: [[SHADOW_PTR0:%.*]] = inttoptr i64 [[SHADOW_ADDR]] to i[[#SBITS]]*
-  ; CHECK-NEXT: [[ORIGIN_OFFSET:%.*]] = add i64 [[OFFSET]], 35184372088832
-  ; CHECK-NEXT: [[ORIGIN_ADDR:%.*]] = and i64 [[ORIGIN_OFFSET]], -4
-  ; CHECK-NEXT: [[ORIGIN_PTR:%.*]] = inttoptr i64 [[ORIGIN_ADDR]] to i32*
-  ; CHECK-NEXT: {{%.*}} = load i32, i32* [[ORIGIN_PTR]], align 4
-  ; CHECK-NEXT: [[SHADOW_PTR1:%.*]] = getelementptr i[[#SBITS]], i[[#SBITS]]* [[SHADOW_PTR0]], i64 1
-  ; CHECK-NEXT: [[SHADOW0:%.*]] = load i[[#SBITS]], i[[#SBITS]]* [[SHADOW_PTR0]], align [[#SBYTES]]
-  ; CHECK-NEXT: [[SHADOW1:%.*]] = load i[[#SBITS]], i[[#SBITS]]* [[SHADOW_PTR1]], align [[#SBYTES]]
-  ; CHECK-NEXT: {{%.*}} = or i[[#SBITS]] [[SHADOW0]], [[SHADOW1]]
-  ; CHECK-NEXT: %a = load i16, i16* %p, align 2
-  ; CHECK-NEXT: store i[[#SBITS]] 0, i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[#SBYTES]]
-  ; CHECK-NEXT: store i32 0, i32* @__dfsan_retval_origin_tls, align 4
-  
-  %p = alloca i16
-  %a = load i16, i16* %p
-  ret i16* %p
-}
-
-@X = constant i1 1
-define i1 @load_global() {
-  ; CHECK-LABEL: @"dfs$load_global"
-  ; CHECK: %a = load i1, i1* @X, align 1
-  ; CHECK-NEXT: store i[[#SBITS]] 0, i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[#SBYTES]]
-  ; CHECK-NEXT: store i32 0, i32* @__dfsan_retval_origin_tls, align 4
-
-  %a = load i1, i1* @X
-  ret i1 %a
-}
-
-define i1 @load1(i1* %p) {
-  ; CHECK-LABEL: @"dfs$load1"
-  ; CHECK-NEXT: [[PO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 0), align 4
-  ; CHECK-NEXT: [[PS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[#SBYTES]]
-  ; CHECK-NEXT: [[INTP:%.*]] = ptrtoint {{.*}} %p to i64
-  ; CHECK-NEXT: [[OFFSET:%.*]] = and i64 [[INTP]]
-  ; CHECK-NEXT: [[SHADOW_ADDR:%.*]] = mul i64 [[OFFSET]], 2
-  ; CHECK-NEXT: [[SHADOW_PTR:%.*]] = inttoptr i64 [[SHADOW_ADDR]] to i[[#SBITS]]*
-  ; CHECK-NEXT: [[ORIGIN_OFFSET:%.*]] = add i64 [[OFFSET]], 35184372088832
-  ; CHECK-NEXT: [[ORIGIN_ADDR:%.*]] = and i64 [[ORIGIN_OFFSET]], -4
-  ; CHECK-NEXT: [[ORIGIN_PTR:%.*]] = inttoptr i64 [[ORIGIN_ADDR]] to i32*
-  ; CHECK-NEXT: [[AO:%.*]] = load i32, i32* [[ORIGIN_PTR]], align 4
-  ; CHECK-NEXT: [[AS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* [[SHADOW_PTR]], align [[#SBYTES]]
-  ; CHECK-NEXT: [[RS:%.*]] = or i[[#SBITS]] [[AS]], [[PS]]
-  ; CHECK-NEXT: [[PS_NZ:%.*]] = icmp ne i[[#SBITS]] [[PS]], 0
-  ; CHECK-NEXT: [[RO:%.*]] = select i1 [[PS_NZ]], i32 [[PO]], i32 [[AO]]
-  ; CHECK-NEXT: %a = load i1, i1* %p, align 1
-  ; CHECK-NEXT: store i[[#SBITS]] [[RS]], i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[#SBYTES]]
-  ; CHECK-NEXT: store i32 [[RO]], i32* @__dfsan_retval_origin_tls, align 4
-
-  %a = load i1, i1* %p
-  ret i1 %a
-}
-
-define i16 @load16(i1 %i, i16* %p) {
-  ; CHECK-LABEL: @"dfs$load16"
-  ; CHECK-NEXT: [[PO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 1), align 4
-  ; CHECK-NEXT: [[PS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 2) to i[[#SBITS]]*), align [[#SBYTES]]
-  ; CHECK-NEXT: [[INTP:%.*]] = ptrtoint {{.*}} %p to i64
-  ; CHECK-NEXT: [[OFFSET:%.*]] = and i64 [[INTP]]
-  ; CHECK-NEXT: [[SHADOW_ADDR:%.*]] = mul i64 [[OFFSET]], 2
-  ; CHECK-NEXT: [[SHADOW_PTR0:%.*]] = inttoptr i64 [[SHADOW_ADDR]] to i[[#SBITS]]*
-  ; CHECK-NEXT: [[ORIGIN_OFFSET:%.*]] = add i64 [[OFFSET]], 35184372088832
-  ; CHECK-NEXT: [[ORIGIN_ADDR:%.*]] = and i64 [[ORIGIN_OFFSET]], -4
-  ; CHECK-NEXT: [[ORIGIN_PTR:%.*]] = inttoptr i64 [[ORIGIN_ADDR]] to i32*
-  ; CHECK-NEXT: [[AO:%.*]] = load i32, i32* [[ORIGIN_PTR]], align 4
-  ; CHECK-NEXT: [[SHADOW_PTR1:%.*]] = getelementptr i[[#SBITS]], i[[#SBITS]]* [[SHADOW_PTR0]], i64 1
-  ; CHECK-NEXT: [[SHADOW0:%.*]] = load i[[#SBITS]], i[[#SBITS]]* [[SHADOW_PTR0]], align [[#SBYTES]]
-  ; CHECK-NEXT: [[SHADOW1:%.*]] = load i[[#SBITS]], i[[#SBITS]]* [[SHADOW_PTR1]], align [[#SBYTES]]
-  ; CHECK-NEXT: [[AS:%.*]] = or i[[#SBITS]] [[SHADOW0]], [[SHADOW1]]
-  ; CHECK-NEXT: [[RS:%.*]] = or i[[#SBITS]] [[AS]], [[PS]]
-  ; CHECK-NEXT: [[PS_NZ:%.*]] = icmp ne i[[#SBITS]] [[PS]], 0
-  ; CHECK-NEXT: [[RO:%.*]] = select i1 [[PS_NZ]], i32 [[PO]], i32 [[AO]]
-  ; CHECK-NEXT: %a = load i16, i16* %p, align 2
-  ; CHECK-NEXT: store i[[#SBITS]] [[RS]], i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[#SBYTES]]
-  ; CHECK-NEXT: store i32 [[RO]], i32* @__dfsan_retval_origin_tls, align 4
-
-  %a = load i16, i16* %p
-  ret i16 %a
-}
-
-define i32 @load32(i32* %p) {
-  ; CHECK-LABEL: @"dfs$load32"
-
-  ; NO_COMBINE_LOAD_PTR-LABEL: @"dfs$load32"
-  ; NO_COMBINE_LOAD_PTR: [[INTP:%.*]] = ptrtoint i32* %p to i64
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[OFFSET:%.*]] = and i64 [[INTP]]
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[SHADOW_ADDR:%.*]] = mul i64 [[OFFSET]], 2
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[SHADOW_PTR:%.*]] = inttoptr i64 [[SHADOW_ADDR]] to i[[#SBITS]]*
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[ORIGIN_ADDR:%.*]] = add i64 [[OFFSET]], 35184372088832
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[ORIGIN_PTR:%.*]] = inttoptr i64 [[ORIGIN_ADDR]] to i32*
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[AO:%.*]] = load i32, i32* [[ORIGIN_PTR]], align 4
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[SHADOW_PTR64:%.*]] = bitcast i[[#SBITS]]* [[SHADOW_PTR]] to i64*
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[SHADOW64:%.*]] = load i64, i64* [[SHADOW_PTR64]], align [[#SBYTES]]
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[SHADOW64_H32:%.*]] = lshr i64 [[SHADOW64]], 32
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[SHADOW64_HL32:%.*]] = or i64 [[SHADOW64]], [[SHADOW64_H32]]
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[SHADOW64_HL32_H16:%.*]] = lshr i64 [[SHADOW64_HL32]], 16
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[SHADOW64_HL32_HL16:%.*]] = or i64 [[SHADOW64_HL32]], [[SHADOW64_HL32_H16]]
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[SHADOW:%.*]] = trunc i64 [[SHADOW64_HL32_HL16]] to i[[#SBITS]]
-  ; NO_COMBINE_LOAD_PTR-NEXT: %a = load i32, i32* %p, align 4
-  ; NO_COMBINE_LOAD_PTR-NEXT: store i[[#SBITS]] [[SHADOW]], i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[#SBYTES]]
-  ; NO_COMBINE_LOAD_PTR-NEXT: store i32 [[AO]], i32* @__dfsan_retval_origin_tls, align 4
-
-  %a = load i32, i32* %p
-  ret i32 %a
-}
-
-define i64 @load64(i64* %p) {
-  ; CHECK-LABEL: @"dfs$load64"
-  
-  ; NO_COMBINE_LOAD_PTR-LABEL: @"dfs$load64"
-  ; NO_COMBINE_LOAD_PTR: [[INTP:%.*]] = ptrtoint i64* %p to i64
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[OFFSET:%.*]] = and i64 [[INTP]]
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[SHADOW_ADDR:%.*]] = mul i64 [[OFFSET]], 2
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[SHADOW_PTR:%.*]] = inttoptr i64 [[SHADOW_ADDR]] to i[[#SBITS]]*
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[ORIGIN_ADDR:%.*]] = add i64 [[OFFSET]], 35184372088832
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[ORIGIN_PTR_0:%.*]] = inttoptr i64 [[ORIGIN_ADDR]] to i32*
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[ORIGIN_0:%.*]] = load i32, i32* [[ORIGIN_PTR_0]], align 8
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[SHADOW_PTR_0:%.*]] = bitcast i[[#SBITS]]* [[SHADOW_PTR]] to i64*
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[SHADOW_0:%.*]] = load i64, i64* [[SHADOW_PTR_0]], align [[#SBYTES]]
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[SHADOW_PTR_1:%.*]] = getelementptr i64, i64* [[SHADOW_PTR_0]], i64 1
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[SHADOW_1:%.*]] = load i64, i64* [[SHADOW_PTR_1]], align [[#SBYTES]]
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[SHADOW64:%.*]] = or i64 [[SHADOW_0]], [[SHADOW_1]]
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[ORIGIN_PTR_1:%.*]] = getelementptr i32, i32* [[ORIGIN_PTR_0]], i64 1
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[ORIGIN_1:%.*]] = load i32, i32* [[ORIGIN_PTR_1]], align 8
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[SHADOW64_H32:%.*]] = lshr i64 [[SHADOW64]], 32
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[SHADOW64_HL32:%.*]] = or i64 [[SHADOW64]], [[SHADOW64_H32]]
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[SHADOW64_HL32_H16:%.*]] = lshr i64 [[SHADOW64_HL32]], 16
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[SHADOW64_HL32_HL16:%.*]] = or i64 [[SHADOW64_HL32]], [[SHADOW64_HL32_H16]]
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[SHADOW:%.*]] = trunc i64 [[SHADOW64_HL32_HL16]] to i[[#SBITS]]
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[SHADOW_1_NZ:%.*]] = icmp ne i64 [[SHADOW_1]], 0
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[ORIGIN:%.*]] = select i1 [[SHADOW_1_NZ]], i32 [[ORIGIN_1]], i32 [[ORIGIN_0]]
-  ; NO_COMBINE_LOAD_PTR-NEXT: %a = load i64, i64* %p, align 8
-  ; NO_COMBINE_LOAD_PTR-NEXT: store i[[#SBITS]] [[SHADOW]], i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[#SBYTES]]
-  ; NO_COMBINE_LOAD_PTR-NEXT: store i32 [[ORIGIN]], i32* @__dfsan_retval_origin_tls, align 4
-
-  %a = load i64, i64* %p
-  ret i64 %a
-}
-
-define i64 @load64_align2(i64* %p) {
-  ; CHECK-LABEL: @"dfs$load64_align2"
-
-  ; NO_COMBINE_LOAD_PTR-LABEL: @"dfs$load64_align2"
-  ; NO_COMBINE_LOAD_PTR: [[INTP:%.*]] = bitcast i64* %p to i8*
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[LABEL_ORIGIN:%.*]] = call zeroext i64 @__dfsan_load_label_and_origin(i8* [[INTP]], i64 8)
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[LABEL_ORIGIN_H32:%.*]] = lshr i64 [[LABEL_ORIGIN]], 32
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[LABEL:%.*]] = trunc i64 [[LABEL_ORIGIN_H32]] to i[[#SBITS]]
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[ORIGIN:%.*]] = trunc i64 [[LABEL_ORIGIN]] to i32
-  ; NO_COMBINE_LOAD_PTR-NEXT: %a = load i64, i64* %p, align [[#SBYTES]]
-  ; NO_COMBINE_LOAD_PTR-NEXT: store i[[#SBITS]] [[LABEL]], i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[#SBYTES]]
-  ; NO_COMBINE_LOAD_PTR-NEXT: store i32 [[ORIGIN]], i32* @__dfsan_retval_origin_tls, align 4
-  
-  %a = load i64, i64* %p, align 2
-  ret i64 %a
-}
-
-define i92 @load92(i92* %p) {
-  ; CHECK-LABEL: @"dfs$load92"
-
-  ; NO_COMBINE_LOAD_PTR-LABEL: @"dfs$load92"
-  ; NO_COMBINE_LOAD_PTR: [[INTP:%.*]] = ptrtoint i92* %p to i64
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[OFFSET:%.*]] = and i64 [[INTP]]
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[SHADOW_ADDR:%.*]] = mul i64 [[OFFSET]], 2
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[SHADOW_PTR:%.*]] = inttoptr i64 [[SHADOW_ADDR]] to i[[#SBITS]]*
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[ORIGIN_ADDR:%.*]] = add i64 [[OFFSET]], 35184372088832
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[ORIGIN_PTR_0:%.*]] = inttoptr i64 [[ORIGIN_ADDR]] to i32*
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[ORIGIN_0:%.*]] = load i32, i32* [[ORIGIN_PTR_0]], align 8
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[SHADOW_PTR_0:%.*]] = bitcast i[[#SBITS]]* [[SHADOW_PTR]] to i64*
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[SHADOW_0:%.*]] = load i64, i64* [[SHADOW_PTR_0]], align [[#SBYTES]]
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[SHADOW_PTR_1:%.*]] = getelementptr i64, i64* [[SHADOW_PTR_0]], i64 1
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[SHADOW_1:%.*]] = load i64, i64* [[SHADOW_PTR_1]], align [[#SBYTES]]
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[SHADOW_01:%.*]] = or i64 [[SHADOW_0]], [[SHADOW_1]]
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[ORIGIN_PTR_1:%.*]] = getelementptr i32, i32* [[ORIGIN_PTR_0]], i64 1
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[ORIGIN_1:%.*]] = load i32, i32* [[ORIGIN_PTR_1]], align 8
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[SHADOW_PTR_2:%.*]] = getelementptr i64, i64* [[SHADOW_PTR_1]], i64 1
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[SHADOW_2:%.*]] = load i64, i64* [[SHADOW_PTR_2]], align [[#SBYTES]]
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[SHADOW64:%.*]] = or i64 [[SHADOW_01]], [[SHADOW_2]]
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[ORIGIN_PTR_2:%.*]] = getelementptr i32, i32* [[ORIGIN_PTR_1]], i64 1
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[ORIGIN_2:%.*]] = load i32, i32* [[ORIGIN_PTR_2]], align 8
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[SHADOW64_H32:%.*]] = lshr i64 [[SHADOW64]], 32
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[SHADOW64_HL32:%.*]] = or i64 [[SHADOW64]], [[SHADOW64_H32]]
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[SHADOW64_HL32_H16:%.*]] = lshr i64 [[SHADOW64_HL32]], 16
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[SHADOW64_HL32_HL16:%.*]] = or i64 [[SHADOW64_HL32]], [[SHADOW64_HL32_H16]]
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[SHADOW:%.*]] = trunc i64 [[SHADOW64_HL32_HL16]] to i[[#SBITS]]
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[SHADOW_1_NZ:%.*]] = icmp ne i64 [[SHADOW_1]], 0
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[ORIGIN_10:%.*]] = select i1 [[SHADOW_1_NZ]], i32 [[ORIGIN_1]], i32 [[ORIGIN_0]]
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[SHADOW_2_NZ:%.*]] = icmp ne i64 [[SHADOW_2]], 0
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[ORIGIN:%.*]] = select i1 [[SHADOW_2_NZ]], i32 [[ORIGIN_2]], i32 [[ORIGIN_10]]
-  ; NO_COMBINE_LOAD_PTR-NEXT: %a = load i92, i92* %p, align 8
-  ; NO_COMBINE_LOAD_PTR-NEXT: store i[[#SBITS]] [[SHADOW]], i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[#SBYTES]]
-  ; NO_COMBINE_LOAD_PTR-NEXT: store i32 [[ORIGIN]], i32* @__dfsan_retval_origin_tls, align 4
-  
-  %a = load i92, i92* %p
-  ret i92 %a
-}
-
-define i17 @load17(i17* %p) {
-  ; CHECK-LABEL: @"dfs$load17"
-
-  ; NO_COMBINE_LOAD_PTR-LABEL: @"dfs$load17"
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[INTP:%.*]] = bitcast i17* %p to i8*
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[LABEL_ORIGIN:%.*]] = call zeroext i64 @__dfsan_load_label_and_origin(i8* [[INTP]], i64 3)
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[LABEL_ORIGIN_H32:%.*]] = lshr i64 [[LABEL_ORIGIN]], 32
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[LABEL:%.*]] = trunc i64 [[LABEL_ORIGIN_H32]] to i[[#SBITS]]
-  ; NO_COMBINE_LOAD_PTR-NEXT: [[ORIGIN:%.*]] = trunc i64 [[LABEL_ORIGIN]] to i32
-  ; NO_COMBINE_LOAD_PTR-NEXT: %a = load i17, i17* %p, align 4
-  ; NO_COMBINE_LOAD_PTR-NEXT: store i[[#SBITS]] [[LABEL]], i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[#SBYTES]]
-  ; NO_COMBINE_LOAD_PTR-NEXT: store i32 [[ORIGIN]], i32* @__dfsan_retval_origin_tls, align 4
-  
-  %a = load i17, i17* %p, align 4
-  ret i17 %a
-}
-
-define void @store_zero_to_non_escaped_alloca() {
-  ; CHECK-LABEL: @"dfs$store_zero_to_non_escaped_alloca"
-  ; CHECK-NEXT: [[A:%.*]] = alloca i[[#SBITS]], align [[#SBYTES]]
-  ; CHECK-NEXT: %_dfsa = alloca i32, align 4
-  ; CHECK-NEXT: %p = alloca i[[#SBITS]], align [[#SBYTES]]
-  ; CHECK-NEXT: store i[[#SBITS]] 0, i[[#SBITS]]* [[A]], align [[#SBYTES]]
-  ; CHECK-NEXT: store i16 1, i16* %p, align 2
-  ; CHECK-NEXT: ret void
-  
-  %p = alloca i16
-  store i16 1, i16* %p
-  ret void
-}
-
-define void @store_nonzero_to_non_escaped_alloca(i16 %a) {
-  ; CHECK-LABEL: @"dfs$store_nonzero_to_non_escaped_alloca"
-  ; CHECK: [[AO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 0), align 4
-  ; CHECK: %_dfsa = alloca i32, align 4
-  ; CHECK: store i32 [[AO]], i32* %_dfsa, align 4
-  
-  %p = alloca i16
-  store i16 %a, i16* %p
-  ret void
-}
-
-declare void @foo(i16* %p)
-
-define void @store_zero_to_escaped_alloca() {
-  ; CHECK-LABEL: @"dfs$store_zero_to_escaped_alloca"
-  ; CHECK: [[SA:%.*]] = bitcast i[[#SBITS]]* {{.*}} to i32*
-  ; CHECK-NEXT: store i32 0, i32* [[SA]], align 2
-  ; CHECK-NEXT: store i[[#SBITS]] 1, i[[#SBITS]]* %p, align 2
-  ; CHECK-NEXT: store i[[#SBITS]] 0, i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[#SBYTES]]
-
-  ; COMBINE_STORE_PTR-LABEL: @"dfs$store_zero_to_escaped_alloca"
-  ; COMBINE_STORE_PTR: [[SA:%.*]] = bitcast i[[#SBITS]]* {{.*}} to i32*
-  ; COMBINE_STORE_PTR_NEXT: store i32 0, i32* [[SA]], align 2
-  ; COMBINE_STORE_PTR_NEXT: store i16 1, i16* %p, align 2
-  ; COMBINE_STORE_PTR_NEXT: call void @foo(i16* %p)
-  ; COMBINE_STORE_PTR_NEXT: store i[[#SBITS]] 0, i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[#SBYTES]]
-
-  %p = alloca i16
-  store i16 1, i16* %p
-  call void @foo(i16* %p)
-  ret void
-}
-
-define void @store_nonzero_to_escaped_alloca(i16 %a) {
-  ; CHECK-LABEL: @"dfs$store_nonzero_to_escaped_alloca"
-  ; CHECK-NEXT: [[AO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 0), align 4
-  ; CHECK-NEXT: [[AS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[#SBYTES]]
-  ; CHECK: [[INTP:%.*]] = ptrtoint {{.*}} %p to i64
-  ; CHECK-NEXT: [[OFFSET:%.*]] = and i64 [[INTP]]
-  ; CHECK: [[ORIGIN_OFFSET:%.*]] = add i64 [[OFFSET]], 35184372088832
-  ; CHECK-NEXT: [[ORIGIN_ADDR:%.*]] = and i64 [[ORIGIN_OFFSET]], -4
-  ; CHECK-NEXT: [[ORIGIN_PTR:%.*]] = inttoptr i64 [[ORIGIN_ADDR]] to i32*
-  ; CHECK: %_dfscmp = icmp ne i[[#SBITS]] [[AS]], 0
-  ; CHECK-NEXT: br i1 %_dfscmp, label %[[L1:.*]], label %[[L2:.*]],
-  ; CHECK: [[L1]]:
-  ; CHECK-NEXT: [[NO:%.*]] = call i32 @__dfsan_chain_origin(i32 [[AO]])
-  ; CHECK-NEXT: store i32 [[NO]], i32* [[ORIGIN_PTR]], align 4
-  ; CHECK-NEXT: br label %[[L2]]
-  ; CHECK: [[L2]]:
-  ; CHECK-NEXT: store i16 %a, i16* %p, align 2
-  
-  ; COMBINE_STORE_PTR-LABEL: @"dfs$store_nonzero_to_escaped_alloca"
-  ; COMBINE_STORE_PTR-NEXT: [[AO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 0), align 4
-  ; COMBINE_STORE_PTR-NEXT: [[AS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[#SBYTES]]
-  ; COMBINE_STORE_PTR: [[INTP:%.*]] = ptrtoint {{.*}} %p to i64
-  ; COMBINE_STORE_PTR-NEXT: [[OFFSET:%.*]] = and i64 [[INTP]]
-  ; COMBINE_STORE_PTR: [[ORIGIN_OFFSET:%.*]] = add i64 [[OFFSET]], 35184372088832
-  ; COMBINE_STORE_PTR-NEXT: [[ORIGIN_ADDR:%.*]] = and i64 [[ORIGIN_OFFSET]], -4
-  ; COMBINE_STORE_PTR-NEXT: [[ORIGIN_PTR:%.*]] = inttoptr i64 [[ORIGIN_ADDR]] to i32*
-  ; COMBINE_STORE_PTR: %_dfscmp = icmp ne i[[#SBITS]] [[AS]], 0
-  ; COMBINE_STORE_PTR-NEXT: br i1 %_dfscmp, label %[[L1:.*]], label %[[L2:.*]],
-  ; COMBINE_STORE_PTR: [[L1]]:
-  ; COMBINE_STORE_PTR-NEXT: [[NO:%.*]] = call i32 @__dfsan_chain_origin(i32 [[AO]])
-  ; COMBINE_STORE_PTR-NEXT: store i32 [[NO]], i32* [[ORIGIN_PTR]], align 4
-  ; COMBINE_STORE_PTR-NEXT: br label %[[L2]]
-  ; COMBINE_STORE_PTR: [[L2]]:
-  ; COMBINE_STORE_PTR-NEXT: store i16 %a, i16* %p, align 2
-  
-  %p = alloca i16
-  store i16 %a, i16* %p
-  call void @foo(i16* %p)
-  ret void
-}
-
-define void @store64_align8(i64* %p, i64 %a) {
-  ; CHECK-LABEL: @"dfs$store64_align8"
-  ; CHECK-NEXT: [[AO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 1), align 4
-  ; CHECK-NEXT: [[AS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 2) to i[[#SBITS]]*), align [[#SBYTES]]
-  ; CHECK: %_dfscmp = icmp ne i[[#SBITS]] [[AS]], 0
-  ; CHECK-NEXT: br i1 %_dfscmp, label %[[L1:.*]], label %[[L2:.*]],
-  ; CHECK: [[L1]]:
-  ; CHECK-NEXT: [[NO:%.*]] = call i32 @__dfsan_chain_origin(i32 [[AO]])
-  ; CHECK-NEXT: [[NO_ZEXT:%.*]] = zext i32 [[NO]] to i64
-  ; CHECK-NEXT: [[NO_SHL:%.*]] = shl i64 [[NO_ZEXT]], 32
-  ; CHECK-NEXT: [[NO2:%.*]] = or i64 [[NO_ZEXT]], [[NO_SHL]]
-  ; CHECK-NEXT: [[O_PTR:%.*]] = bitcast i32* {{.*}} to i64*
-  ; CHECK-NEXT: store i64 [[NO2]], i64* [[O_PTR]], align 8
-  ; CHECK-NEXT: br label %[[L2]]
-  ; CHECK: [[L2]]:
-  ; CHECK-NEXT: store i64 %a, i64* %p, align 8
-  
-  ; COMBINE_STORE_PTR-LABEL: @"dfs$store64_align8"
-  ; COMBINE_STORE_PTR-NEXT: [[PO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 0), align 4
-  ; COMBINE_STORE_PTR-NEXT: [[PS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[#SBYTES]]
-  ; COMBINE_STORE_PTR-NEXT: [[AO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 1), align 4
-  ; COMBINE_STORE_PTR-NEXT: [[AS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 2) to i[[#SBITS]]*), align [[#SBYTES]]
-  ; COMBINE_STORE_PTR-NEXT: [[MS:%.*]] = or i[[#SBITS]] [[AS]], [[PS]]
-  ; COMBINE_STORE_PTR-NEXT: [[NE:%.*]] = icmp ne i[[#SBITS]] [[PS]], 0
-  ; COMBINE_STORE_PTR-NEXT: [[MO:%.*]] = select i1 [[NE]], i32 [[PO]], i32 [[AO]]
-  ; COMBINE_STORE_PTR: %_dfscmp = icmp ne i[[#SBITS]] [[MS]], 0
-  ; COMBINE_STORE_PTR-NEXT: br i1 %_dfscmp, label %[[L1:.*]], label %[[L2:.*]],
-  ; COMBINE_STORE_PTR: [[L1]]:
-  ; COMBINE_STORE_PTR-NEXT: [[NO:%.*]] = call i32 @__dfsan_chain_origin(i32 [[MO]])
-  ; COMBINE_STORE_PTR-NEXT: [[NO_ZEXT:%.*]] = zext i32 [[NO]] to i64
-  ; COMBINE_STORE_PTR-NEXT: [[NO_SHL:%.*]] = shl i64 [[NO_ZEXT]], 32
-  ; COMBINE_STORE_PTR-NEXT: [[NO2:%.*]] = or i64 [[NO_ZEXT]], [[NO_SHL]]
-  ; COMBINE_STORE_PTR-NEXT: [[O_PTR:%.*]] = bitcast i32* {{.*}} to i64*
-  ; COMBINE_STORE_PTR-NEXT: store i64 [[NO2]], i64* [[O_PTR]], align 8
-  ; COMBINE_STORE_PTR-NEXT: br label %[[L2]]
-  ; COMBINE_STORE_PTR: [[L2]]:
-  ; COMBINE_STORE_PTR-NEXT: store i64 %a, i64* %p, align 8
-  
-  store i64 %a, i64* %p
-  ret void
-}
-
-define void @store64_align2(i64* %p, i64 %a) {
-  ; CHECK-LABEL: @"dfs$store64_align2"
-  ; CHECK-NEXT: [[AO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 1), align 4
-  ; CHECK-NEXT: [[AS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 2) to i[[#SBITS]]*), align [[#SBYTES]]
-  ; CHECK: %_dfscmp = icmp ne i[[#SBITS]] [[AS]], 0
-  ; CHECK-NEXT: br i1 %_dfscmp, label %[[L1:.*]], label %[[L2:.*]],
-  ; CHECK: [[L1]]:
-  ; CHECK-NEXT: [[NO:%.*]] = call i32 @__dfsan_chain_origin(i32 [[AO]])
-  ; CHECK-NEXT: store i32 [[NO]], i32* [[O_PTR0:%.*]], align 4
-  ; CHECK-NEXT: [[O_PTR1:%.*]] = getelementptr i32, i32* [[O_PTR0]], i32 1
-  ; CHECK-NEXT: store i32 [[NO]], i32* [[O_PTR1]], align 4
-  ; CHECK: [[L2]]:
-  ; CHECK-NEXT: store i64 %a, i64* %p, align [[#SBYTES]]
-  
-  store i64 %a, i64* %p, align 2
-  ret void
-}
-
-define void @store96_align8(i96* %p, i96 %a) {
-  ; CHECK-LABEL: @"dfs$store96_align8"
-  ; CHECK-NEXT: [[AO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 1), align 4
-  ; CHECK-NEXT: [[AS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 2) to i[[#SBITS]]*), align [[#SBYTES]]
-  ; CHECK: %_dfscmp = icmp ne i[[#SBITS]] [[AS]], 0
-  ; CHECK-NEXT: br i1 %_dfscmp, label %[[L1:.*]], label %[[L2:.*]],
-  ; CHECK: [[L1]]:
-  ; CHECK-NEXT: [[NO:%.*]] = call i32 @__dfsan_chain_origin(i32 [[AO]])
-  ; CHECK-NEXT: [[NO_ZEXT:%.*]] = zext i32 [[NO]] to i64
-  ; CHECK-NEXT: [[NO_SHL:%.*]] = shl i64 [[NO_ZEXT]], 32
-  ; CHECK-NEXT: [[NO2:%.*]] = or i64 [[NO_ZEXT]], [[NO_SHL]]
-  ; CHECK-NEXT: [[O_PTR64:%.*]] = bitcast i32* [[O_PTR0:%.*]] to i64*
-  ; CHECK-NEXT: store i64 [[NO2]], i64* [[O_PTR64]], align 8
-  ; CHECK-NEXT: [[O_PTR1:%.*]] = getelementptr i32, i32* [[O_PTR0]], i32 2
-  ; CHECK-NEXT: store i32 [[NO]], i32* [[O_PTR1]], align 8
-  ; CHECK: [[L2]]:
-  ; CHECK-NEXT: store i96 %a, i96* %p, align 8
-  
-  store i96 %a, i96* %p, align 8
-  ret void
-}

diff  --git a/llvm/test/Instrumentation/DataFlowSanitizer/origin_load.ll b/llvm/test/Instrumentation/DataFlowSanitizer/origin_load.ll
new file mode 100644
index 0000000000000..35650c9450eb0
--- /dev/null
+++ b/llvm/test/Instrumentation/DataFlowSanitizer/origin_load.ll
@@ -0,0 +1,327 @@
+; RUN: opt < %s -dfsan -dfsan-track-origins=1 -dfsan-fast-8-labels -S | FileCheck %s --check-prefixes=CHECK,CHECK8,COMBINE_LOAD_PTR
+; RUN: opt < %s -dfsan -dfsan-track-origins=1 -dfsan-fast-8-labels -dfsan-combine-pointer-labels-on-load=false -S | FileCheck %s --check-prefixes=CHECK,CHECK8
+; RUN: opt < %s -dfsan -dfsan-track-origins=1 -dfsan-fast-16-labels -S | FileCheck %s --check-prefixes=CHECK,CHECK16,COMBINE_LOAD_PTR
+; RUN: opt < %s -dfsan -dfsan-track-origins=1 -dfsan-fast-16-labels -dfsan-combine-pointer-labels-on-load=false -S | FileCheck %s --check-prefixes=CHECK,CHECK16
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK: @__dfsan_shadow_width_bits = weak_odr constant i32 [[#SBITS:]]
+; CHECK: @__dfsan_shadow_width_bytes = weak_odr constant i32 [[#SBYTES:]]
+
+define {} @load0({}* %p) {
+  ; CHECK-LABEL: @"dfs$load0"
+  ; CHECK-NEXT: %a = load {}, {}* %p, align 1
+  ; CHECK-NEXT: store {} zeroinitializer, {}* bitcast ([100 x i64]* @__dfsan_retval_tls to {}*), align [[ALIGN:2]]
+  ; CHECK-NEXT: store i32 0, i32* @__dfsan_retval_origin_tls, align 4
+  ; CHECK-NEXT: ret {} %a
+
+  %a = load {}, {}* %p
+  ret {} %a
+}
+
+define i16 @load_non_escaped_alloca() {
+  ; CHECK-LABEL: @"dfs$load_non_escaped_alloca"
+  ; CHECK-NEXT: %[[#S_ALLOCA:]] = alloca i[[#SBITS]], align [[#SBYTES]]
+  ; CHECK-NEXT: %_dfsa = alloca i32, align 4
+  ; CHECK:      %[[#SHADOW:]] = load i[[#SBITS]], i[[#SBITS]]* %[[#S_ALLOCA]], align [[#SBYTES]]
+  ; CHECK-NEXT: %[[#ORIGIN:]] = load i32, i32* %_dfsa, align 4
+  ; CHECK-NEXT: %a = load i16, i16* %p, align 2
+  ; CHECK-NEXT: store i[[#SBITS]] %[[#SHADOW]], i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[ALIGN]]
+  ; CHECK-NEXT: store i32 %[[#ORIGIN]], i32* @__dfsan_retval_origin_tls, align 4
+  
+  %p = alloca i16
+  %a = load i16, i16* %p
+  ret i16 %a
+}
+
+define i16* @load_escaped_alloca() {
+  ; CHECK-LABEL:  @"dfs$load_escaped_alloca"
+  ; CHECK:        %[[#INTP:]] = ptrtoint i16* %p to i64
+  ; CHECK-NEXT:   %[[#SHADOW_ADDR:]] = and i64 %[[#INTP]], [[#%.10d,MASK:]]
+  ; CHECK16-NEXT: %[[#SHADOW_ADDR:]] = mul i64 %[[#SHADOW_ADDR]], 2
+  ; CHECK-NEXT:   %[[#SHADOW_PTR0:]] = inttoptr i64 %[[#SHADOW_ADDR]] to i[[#SBITS]]*
+  ; CHECK-NEXT:   %[[#ORIGIN_OFFSET:]] = add i64 %[[#INTP+1]], [[#%.10d,ORIGIN_MASK:]]
+  ; CHECK-NEXT:   %[[#ORIGIN_ADDR:]] = and i64 %[[#ORIGIN_OFFSET]], -4
+  ; CHECK-NEXT:   %[[#ORIGIN_PTR:]] = inttoptr i64 %[[#ORIGIN_ADDR]] to i32*
+  ; CHECK-NEXT:   {{%.*}} = load i32, i32* %[[#ORIGIN_PTR]], align 4
+  ; CHECK-NEXT:   %[[#SHADOW_PTR1:]] = getelementptr i[[#SBITS]], i[[#SBITS]]* %[[#SHADOW_PTR0]], i64 1
+  ; CHECK-NEXT:   %[[#SHADOW:]]  = load i[[#SBITS]], i[[#SBITS]]* %[[#SHADOW_PTR0]], align [[#SBYTES]]
+  ; CHECK-NEXT:   %[[#SHADOW+1]] = load i[[#SBITS]], i[[#SBITS]]* %[[#SHADOW_PTR1]], align [[#SBYTES]]
+  ; CHECK-NEXT:   {{%.*}} = or i[[#SBITS]] %[[#SHADOW]], %[[#SHADOW+1]]
+  ; CHECK-NEXT:   %a = load i16, i16* %p, align 2
+  ; CHECK-NEXT:   store i[[#SBITS]] 0, i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[ALIGN]]
+  ; CHECK-NEXT:   store i32 0, i32* @__dfsan_retval_origin_tls, align 4
+  
+  %p = alloca i16
+  %a = load i16, i16* %p
+  ret i16* %p
+}
+
+@X = constant i1 1
+define i1 @load_global() {
+  ; CHECK-LABEL: @"dfs$load_global"
+  ; CHECK: %a = load i1, i1* @X, align 1
+  ; CHECK-NEXT: store i[[#SBITS]] 0, i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[ALIGN]]
+  ; CHECK-NEXT: store i32 0, i32* @__dfsan_retval_origin_tls, align 4
+
+  %a = load i1, i1* @X
+  ret i1 %a
+}
+
+define i1 @load1(i1* %p) {
+  ; CHECK-LABEL:             @"dfs$load1"
+
+  ; COMBINE_LOAD_PTR-NEXT: %[[#PO:]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 0), align 4
+  ; COMBINE_LOAD_PTR-NEXT: %[[#PS:]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN]]
+
+  ; CHECK-NEXT:            %[[#INTP:]] = ptrtoint i1* %p to i64
+  ; CHECK-NEXT:            %[[#SHADOW_ADDR:]] = and i64 %[[#INTP]], [[#MASK]]
+  ; CHECK16-NEXT:          %[[#SHADOW_ADDR:]] = mul i64 %[[#SHADOW_ADDR]], 2
+  ; CHECK-NEXT:            %[[#SHADOW_PTR:]] = inttoptr i64 %[[#SHADOW_ADDR]] to i[[#SBITS]]*
+  ; CHECK-NEXT:            %[[#ORIGIN_OFFSET:]] = add i64 %[[#INTP+1]], [[#ORIGIN_MASK]]
+  ; CHECK-NEXT:            %[[#ORIGIN_ADDR:]] = and i64 %[[#ORIGIN_OFFSET]], -4
+  ; CHECK-NEXT:            %[[#ORIGIN_PTR:]] = inttoptr i64 %[[#ORIGIN_ADDR]] to i32*
+  ; CHECK-NEXT:            %[[#AO:]] = load i32, i32* %[[#ORIGIN_PTR]], align 4
+  ; CHECK-NEXT:            %[[#AS:]] = load i[[#SBITS]], i[[#SBITS]]* %[[#SHADOW_PTR]], align [[#SBYTES]]
+
+  ; COMBINE_LOAD_PTR-NEXT: %[[#AS:]] = or i[[#SBITS]] %[[#AS]], %[[#PS]]
+  ; COMBINE_LOAD_PTR-NEXT: %[[#NZ:]] = icmp ne i[[#SBITS]] %[[#PS]], 0
+  ; COMBINE_LOAD_PTR-NEXT: %[[#AO:]] = select i1 %[[#NZ]], i32 %[[#PO]], i32 %[[#AO]]
+
+  ; CHECK-NEXT:            %a = load i1, i1* %p, align 1
+  ; CHECK-NEXT:            store i[[#SBITS]] %[[#AS]], i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[ALIGN]]
+  ; CHECK-NEXT:            store i32 %[[#AO]], i32* @__dfsan_retval_origin_tls, align 4
+
+  %a = load i1, i1* %p
+  ret i1 %a
+}
+
+define i16 @load16(i1 %i, i16* %p) {
+  ; CHECK-LABEL: @"dfs$load16"
+
+  ; COMBINE_LOAD_PTR-NEXT: %[[#PO:]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 1), align 4
+  ; COMBINE_LOAD_PTR-NEXT: %[[#PS:]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 2) to i[[#SBITS]]*), align [[ALIGN]]
+
+  ; CHECK-NEXT:            %[[#INTP:]] = ptrtoint i16* %p to i64
+  ; CHECK-NEXT:            %[[#SHADOW_ADDR:]] = and i64 %[[#INTP]], [[#MASK]]
+  ; CHECK16-NEXT:          %[[#SHADOW_ADDR:]] = mul i64 %[[#SHADOW_ADDR]], 2
+  ; CHECK-NEXT:            %[[#SHADOW_PTR0:]] = inttoptr i64 %[[#SHADOW_ADDR]] to i[[#SBITS]]*
+  ; CHECK-NEXT:            %[[#ORIGIN_OFFSET:]] = add i64 %[[#INTP+1]], [[#ORIGIN_MASK]]
+  ; CHECK-NEXT:            %[[#ORIGIN_ADDR:]] = and i64 %[[#ORIGIN_OFFSET]], -4
+  ; CHECK-NEXT:            %[[#ORIGIN_PTR:]] = inttoptr i64 %[[#ORIGIN_ADDR]] to i32*
+  ; CHECK-NEXT:            %[[#AO:]] = load i32, i32* %[[#ORIGIN_PTR]], align 4
+  ; CHECK-NEXT:            %[[#SHADOW_PTR1:]] = getelementptr i[[#SBITS]], i[[#SBITS]]* %[[#SHADOW_PTR0]], i64 1
+  ; CHECK-NEXT:            %[[#SHADOW:]]  = load i[[#SBITS]], i[[#SBITS]]* %[[#SHADOW_PTR0]], align [[#SBYTES]]
+  ; CHECK-NEXT:            %[[#SHADOW+1]] = load i[[#SBITS]], i[[#SBITS]]* %[[#SHADOW_PTR1]], align [[#SBYTES]]
+  ; CHECK-NEXT:            %[[#AS:]] = or i[[#SBITS]] %[[#SHADOW]], %[[#SHADOW+1]]
+
+  ; COMBINE_LOAD_PTR-NEXT: %[[#AS:]] = or i[[#SBITS]] %[[#AS]], %[[#PS]]
+  ; COMBINE_LOAD_PTR-NEXT: %[[#NZ:]] = icmp ne i[[#SBITS]] %[[#PS]], 0
+  ; COMBINE_LOAD_PTR-NEXT: %[[#AO:]] = select i1 %[[#NZ]], i32 %[[#PO]], i32 %[[#AO]]
+
+  ; CHECK-NEXT:            %a = load i16, i16* %p, align 2
+  ; CHECK-NEXT:            store i[[#SBITS]] %[[#AS]], i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[ALIGN]]
+  ; CHECK-NEXT:            store i32 %[[#AO]], i32* @__dfsan_retval_origin_tls, align 4
+
+  %a = load i16, i16* %p
+  ret i16 %a
+}
+
+define i32 @load32(i32* %p) {
+  ; CHECK-LABEL: @"dfs$load32"
+
+  ; COMBINE_LOAD_PTR-NEXT: %[[#PO:]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 0), align 4
+  ; COMBINE_LOAD_PTR-NEXT: %[[#PS:]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN]]
+
+  ; CHECK:                 %[[#INTP:]] = ptrtoint i32* %p to i64
+  ; CHECK-NEXT:            %[[#SHADOW_ADDR:INTP+1]] = and i64 %[[#INTP]], [[#MASK]]
+  ; CHECK16-NEXT:          %[[#SHADOW_ADDR:]] = mul i64 %[[#SHADOW_ADDR]], 2
+  ; CHECK-NEXT:            %[[#SHADOW_PTR:]] = inttoptr i64 %[[#SHADOW_ADDR]] to i[[#SBITS]]*
+  ; CHECK-NEXT:            %[[#ORIGIN_ADDR:]] = add i64 %[[#INTP+1]], [[#ORIGIN_MASK]]
+  ; CHECK-NEXT:            %[[#ORIGIN_PTR:]] = inttoptr i64 %[[#ORIGIN_ADDR]] to i32*
+  ; CHECK-NEXT:            %[[#AO:]] = load i32, i32* %[[#ORIGIN_PTR]], align 4
+  ; CHECK-NEXT:            %[[#WIDE_SHADOW_PTR:]] = bitcast i[[#SBITS]]* %[[#SHADOW_PTR]] to i[[#WSBITS:mul(SBITS,4)]]*
+  ; CHECK-NEXT:            %[[#WIDE_SHADOW:]] = load i[[#WSBITS]], i[[#WSBITS]]* %[[#WIDE_SHADOW_PTR]], align [[#SBYTES]]
+  ; CHECK-NEXT:            %[[#WIDE_SHADOW+1]] = lshr i[[#WSBITS]] %[[#WIDE_SHADOW]], [[#mul(SBITS,2)]]
+  ; CHECK-NEXT:            %[[#WIDE_SHADOW+2]] = or i[[#WSBITS]] %[[#WIDE_SHADOW]], %[[#WIDE_SHADOW+1]]
+  ; CHECK-NEXT:            %[[#WIDE_SHADOW+3]] = lshr i[[#WSBITS]] %[[#WIDE_SHADOW+2]], [[#SBITS]]
+  ; CHECK-NEXT:            %[[#WIDE_SHADOW+4]] = or i[[#WSBITS]] %[[#WIDE_SHADOW+2]], %[[#WIDE_SHADOW+3]]
+  ; CHECK-NEXT:            %[[#SHADOW:]] = trunc i[[#WSBITS]] %[[#WIDE_SHADOW+4]] to i[[#SBITS]]
+
+  ; COMBINE_LOAD_PTR-NEXT: %[[#SHADOW:]] = or i[[#SBITS]] %[[#SHADOW]], %[[#PS]]
+  ; COMBINE_LOAD_PTR-NEXT: %[[#NZ:]] = icmp ne i[[#SBITS]] %[[#PS]], 0
+  ; COMBINE_LOAD_PTR-NEXT: %[[#AO:]] = select i1 %[[#NZ]], i32 %[[#PO]], i32 %[[#AO]]
+
+  ; CHECK-NEXT:            %a = load i32, i32* %p, align 4
+  ; CHECK-NEXT:            store i[[#SBITS]] %[[#SHADOW]], i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[ALIGN]]
+  ; CHECK-NEXT:            store i32 %[[#AO]], i32* @__dfsan_retval_origin_tls, align 4
+
+  %a = load i32, i32* %p
+  ret i32 %a
+}
+
+define i64 @load64(i64* %p) {
+  ; CHECK-LABEL: @"dfs$load64"
+
+  ; COMBINE_LOAD_PTR-NEXT: %[[#PO:]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 0), align 4
+  ; COMBINE_LOAD_PTR-NEXT: %[[#PS:]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN]]
+
+  ; CHECK:                 %[[#INTP:]] = ptrtoint i64* %p to i64
+  ; CHECK-NEXT:            %[[#SHADOW_ADDR:INTP+1]] = and i64 %[[#INTP]], [[#MASK]]
+  ; CHECK16-NEXT:          %[[#SHADOW_ADDR:]] = mul i64 %[[#SHADOW_ADDR]], 2
+  ; CHECK-NEXT:            %[[#SHADOW_PTR:]] = inttoptr i64 %[[#SHADOW_ADDR]] to i[[#SBITS]]*
+  ; CHECK-NEXT:            %[[#ORIGIN_ADDR:]] = add i64 %[[#INTP+1]], [[#ORIGIN_MASK]]
+  ; CHECK-NEXT:            %[[#ORIGIN_PTR:]] = inttoptr i64 %[[#ORIGIN_ADDR]] to i32*
+  ; CHECK-NEXT:            %[[#ORIGIN:]] = load i32, i32* %[[#ORIGIN_PTR]], align 8
+  ; CHECK-NEXT:            %[[#WIDE_SHADOW_PTR:]] = bitcast i[[#SBITS]]* %[[#SHADOW_PTR]] to i64*
+  ; CHECK-NEXT:            %[[#WIDE_SHADOW:]] = load i64, i64* %[[#WIDE_SHADOW_PTR]], align [[#SBYTES]]
+
+  ; COMM: On fast16, the 2x64 shadow bits need to be ORed first.
+  ; CHECK16-NEXT:          %[[#WIDE_SHADOW_PTR2:]] = getelementptr i64, i64* %[[#WIDE_SHADOW_PTR]], i64 1
+  ; CHECK16-NEXT:          %[[#WIDE_SHADOW2:]] = load i64, i64* %[[#WIDE_SHADOW_PTR2]], align [[#SBYTES]]
+  ; CHECK16-NEXT:          %[[#WIDE_SHADOW:]] = or i64 %[[#WIDE_SHADOW]], %[[#WIDE_SHADOW2]]
+  ; CHECK16-NEXT:          %[[#ORIGIN_PTR2:]] = getelementptr i32, i32* %[[#ORIGIN_PTR]], i64 1
+  ; CHECK16-NEXT:          %[[#ORIGIN2:]] = load i32, i32* %[[#ORIGIN_PTR2]], align 8
+  ; CHECK16-NEXT:          %[[#WIDE_SHADOW_SHIFTED:]] = lshr i64 %[[#WIDE_SHADOW]], 32
+  ; CHECK16-NEXT:          %[[#WIDE_SHADOW:]] = or i64 %[[#WIDE_SHADOW]], %[[#WIDE_SHADOW_SHIFTED]]
+  ; CHECK16-NEXT:          %[[#WIDE_SHADOW_SHIFTED:]] = lshr i64 %[[#WIDE_SHADOW]], 16
+  ; CHECK16-NEXT:          %[[#WIDE_SHADOW:]] = or i64 %[[#WIDE_SHADOW]], %[[#WIDE_SHADOW_SHIFTED]]
+  ; CHECK16-NEXT:          %[[#SHADOW:]] = trunc i64 %[[#WIDE_SHADOW]] to i[[#SBITS]]
+  ; CHECK16-NEXT:          %[[#SHADOW_NZ:]] = icmp ne i64 %[[#WIDE_SHADOW2]], 0
+  ; CHECK16-NEXT:          %[[#ORIGIN:]] = select i1 %[[#SHADOW_NZ]], i32 %[[#ORIGIN2]], i32 %[[#ORIGIN]]
+
+  ; COMM: On fast8, no need to OR the wide shadow but one more shift is needed.
+  ; CHECK8-NEXT:           %[[#WIDE_SHADOW_SHIFTED:]] = lshr i64 %[[#WIDE_SHADOW]], 32
+  ; CHECK8-NEXT:           %[[#WIDE_SHADOW:]] = or i64 %[[#WIDE_SHADOW]], %[[#WIDE_SHADOW_SHIFTED]]
+  ; CHECK8-NEXT:           %[[#WIDE_SHADOW_SHIFTED:]] = lshr i64 %[[#WIDE_SHADOW]], 16
+  ; CHECK8-NEXT:           %[[#WIDE_SHADOW:]] = or i64 %[[#WIDE_SHADOW]], %[[#WIDE_SHADOW_SHIFTED]]
+  ; CHECK8-NEXT:           %[[#WIDE_SHADOW_SHIFTED:]] = lshr i64 %[[#WIDE_SHADOW]], 8
+  ; CHECK8-NEXT:           %[[#WIDE_SHADOW:]] = or i64 %[[#WIDE_SHADOW]], %[[#WIDE_SHADOW_SHIFTED]]
+  ; CHECK8-NEXT:           %[[#SHADOW:]] = trunc i64 %[[#WIDE_SHADOW]] to i[[#SBITS]]
+
+  ; COMBINE_LOAD_PTR-NEXT: %[[#SHADOW:]] = or i[[#SBITS]] %[[#SHADOW]], %[[#PS]]
+  ; COMBINE_LOAD_PTR-NEXT: %[[#NZ:]] = icmp ne i[[#SBITS]] %[[#PS]], 0
+  ; COMBINE_LOAD_PTR-NEXT: %[[#ORIGIN:]] = select i1 %[[#NZ]], i32 %[[#PO]], i32 %[[#ORIGIN]]
+
+  ; CHECK-NEXT:            %a = load i64, i64* %p, align 8
+  ; CHECK-NEXT:            store i[[#SBITS]] %[[#SHADOW]], i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[ALIGN]]
+  ; CHECK-NEXT:            store i32 %[[#ORIGIN]], i32* @__dfsan_retval_origin_tls, align 4
+  
+  %a = load i64, i64* %p
+  ret i64 %a
+}
+
+define i64 @load64_align2(i64* %p) {
+  ; CHECK-LABEL: @"dfs$load64_align2"
+
+  ; COMBINE_LOAD_PTR-NEXT: %[[#PO:]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 0), align 4
+  ; COMBINE_LOAD_PTR-NEXT: %[[#PS:]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN]]
+
+  ; CHECK-NEXT:            %[[#INTP:]] = bitcast i64* %p to i8*
+  ; CHECK-NEXT:            %[[#LABEL_ORIGIN:]] = call zeroext i64 @__dfsan_load_label_and_origin(i8* %[[#INTP]], i64 8)
+  ; CHECK-NEXT:            %[[#LABEL_ORIGIN+1]] = lshr i64 %[[#LABEL_ORIGIN]], 32
+  ; CHECK-NEXT:            %[[#LABEL:]] = trunc i64 %[[#LABEL_ORIGIN+1]] to i[[#SBITS]]
+  ; CHECK-NEXT:            %[[#ORIGIN:]] = trunc i64 %[[#LABEL_ORIGIN]] to i32
+
+  ; COMBINE_LOAD_PTR-NEXT: %[[#LABEL:]] = or i[[#SBITS]] %[[#LABEL]], %[[#PS]]
+  ; COMBINE_LOAD_PTR-NEXT: %[[#NZ:]] = icmp ne i[[#SBITS]] %[[#PS]], 0
+  ; COMBINE_LOAD_PTR-NEXT: %[[#ORIGIN:]] = select i1 %[[#NZ]], i32 %[[#PO]], i32 %[[#ORIGIN]]
+
+  ; CHECK-NEXT:            %a = load i64, i64* %p, align 2
+  ; CHECK-NEXT:            store i[[#SBITS]] %[[#LABEL]], i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[ALIGN]]
+  ; CHECK-NEXT:            store i32 %[[#ORIGIN]], i32* @__dfsan_retval_origin_tls, align 4
+
+  %a = load i64, i64* %p, align 2
+  ret i64 %a
+}
+
+define i128 @load128(i128* %p) {
+  ; CHECK-LABEL: @"dfs$load128"
+
+  ; COMBINE_LOAD_PTR-NEXT: %[[#PO:]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 0), align 4
+  ; COMBINE_LOAD_PTR-NEXT: %[[#PS:]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN]]
+
+  ; CHECK:                 %[[#INTP:]] = ptrtoint i128* %p to i64
+  ; CHECK-NEXT:            %[[#SHADOW_ADDR:INTP+1]] = and i64 %[[#INTP]], [[#MASK]]
+  ; CHECK16-NEXT:          %[[#SHADOW_ADDR:]] = mul i64 %[[#SHADOW_ADDR]], 2
+  ; CHECK-NEXT:            %[[#SHADOW_PTR:]] = inttoptr i64 %[[#SHADOW_ADDR]] to i[[#SBITS]]*
+  ; CHECK-NEXT:            %[[#ORIGIN_ADDR:]] = add i64 %[[#INTP+1]], [[#ORIGIN_MASK]]
+  ; CHECK-NEXT:            %[[#ORIGIN_PTR:]] = inttoptr i64 %[[#ORIGIN_ADDR]] to i32*
+  ; CHECK-NEXT:            %[[#ORIGIN:]] = load i32, i32* %[[#ORIGIN_PTR]], align 8
+  ; CHECK-NEXT:            %[[#WIDE_SHADOW_PTR:]] = bitcast i[[#SBITS]]* %[[#SHADOW_PTR]] to i64*
+  ; CHECK-NEXT:            %[[#WIDE_SHADOW:]] = load i64, i64* %[[#WIDE_SHADOW_PTR]], align [[#SBYTES]]
+  ; CHECK-NEXT:            %[[#WIDE_SHADOW_PTR2:]] = getelementptr i64, i64* %[[#WIDE_SHADOW_PTR]], i64 1
+  ; CHECK-NEXT:            %[[#WIDE_SHADOW2:]] = load i64, i64* %[[#WIDE_SHADOW_PTR2]], align [[#SBYTES]]
+  ; CHECK-NEXT:            %[[#WIDE_SHADOW:]] = or i64 %[[#WIDE_SHADOW]], %[[#WIDE_SHADOW2]]
+  ; CHECK-NEXT:            %[[#ORIGIN_PTR2:]] = getelementptr i32, i32* %[[#ORIGIN_PTR]], i64 1
+  ; CHECK-NEXT:            %[[#ORIGIN2:]] = load i32, i32* %[[#ORIGIN_PTR2]], align 8
+
+  ; COMM: On fast16, we need to OR 4x64bits for the wide shadow, before ORing its bytes.
+  ; CHECK16-NEXT:          %[[#WIDE_SHADOW_PTR3:]] = getelementptr i64, i64* %[[#WIDE_SHADOW_PTR2]], i64 1
+  ; CHECK16-NEXT:          %[[#WIDE_SHADOW3:]] = load i64, i64* %[[#WIDE_SHADOW_PTR3]], align [[#SBYTES]]
+  ; CHECK16-NEXT:          %[[#WIDE_SHADOW:]] = or i64 %[[#WIDE_SHADOW]], %[[#WIDE_SHADOW3]]
+  ; CHECK16-NEXT:          %[[#ORIGIN_PTR3:]] = getelementptr i32, i32* %[[#ORIGIN_PTR2]], i64 1
+  ; CHECK16-NEXT:          %[[#ORIGIN3:]] = load i32, i32* %[[#ORIGIN_PTR3]], align 8
+  ; CHECK16-NEXT:          %[[#WIDE_SHADOW_PTR4:]] = getelementptr i64, i64* %[[#WIDE_SHADOW_PTR3]], i64 1
+  ; CHECK16-NEXT:          %[[#WIDE_SHADOW4:]] = load i64, i64* %[[#WIDE_SHADOW_PTR4]], align [[#SBYTES]]
+  ; CHECK16-NEXT:          %[[#WIDE_SHADOW:]] = or i64 %[[#WIDE_SHADOW]], %[[#WIDE_SHADOW4]]
+  ; CHECK16-NEXT:          %[[#ORIGIN_PTR4:]] = getelementptr i32, i32* %[[#ORIGIN_PTR3]], i64 1
+  ; CHECK16-NEXT:          %[[#ORIGIN4:]] = load i32, i32* %[[#ORIGIN_PTR4]], align 8
+  ; CHECK16-NEXT:          %[[#WIDE_SHADOW_SHIFTED:]] = lshr i64 %[[#WIDE_SHADOW]], 32
+  ; CHECK16-NEXT:          %[[#WIDE_SHADOW:]] = or i64 %[[#WIDE_SHADOW]], %[[#WIDE_SHADOW_SHIFTED]]
+  ; CHECK16-NEXT:          %[[#WIDE_SHADOW_SHIFTED:]] = lshr i64 %[[#WIDE_SHADOW]], 16
+  ; CHECK16-NEXT:          %[[#WIDE_SHADOW:]] = or i64 %[[#WIDE_SHADOW]], %[[#WIDE_SHADOW_SHIFTED]]
+  ; CHECK16-NEXT:          %[[#SHADOW:]] = trunc i64 %[[#WIDE_SHADOW]] to i[[#SBITS]]
+  ; CHECK16-NEXT:          %[[#SHADOW2_NZ:]] = icmp ne i64 %[[#WIDE_SHADOW2]], 0
+  ; CHECK16-NEXT:          %[[#ORIGIN:]] = select i1 %[[#SHADOW2_NZ]], i32 %[[#ORIGIN2]], i32 %[[#ORIGIN]]
+  ; CHECK16-NEXT:          %[[#SHADOW3_NZ:]] = icmp ne i64 %[[#WIDE_SHADOW3]], 0
+  ; CHECK16-NEXT:          %[[#ORIGIN:]] = select i1 %[[#SHADOW3_NZ]], i32 %[[#ORIGIN3]], i32 %[[#ORIGIN]]
+  ; CHECK16-NEXT:          %[[#SHADOW4_NZ:]] = icmp ne i64 %[[#WIDE_SHADOW4]], 0
+  ; CHECK16-NEXT:          %[[#ORIGIN:]] = select i1 %[[#SHADOW4_NZ]], i32 %[[#ORIGIN4]], i32 %[[#ORIGIN]]
+  
+  ; COMM: On fast8, we need to OR 2x64bits for the wide shadow, before ORing its bytes (one more shift).
+  ; CHECK8-NEXT:           %[[#WIDE_SHADOW_SHIFTED:]] = lshr i64 %[[#WIDE_SHADOW]], 32
+  ; CHECK8-NEXT:           %[[#WIDE_SHADOW:]] = or i64 %[[#WIDE_SHADOW]], %[[#WIDE_SHADOW_SHIFTED]]
+  ; CHECK8-NEXT:           %[[#WIDE_SHADOW_SHIFTED:]] = lshr i64 %[[#WIDE_SHADOW]], 16
+  ; CHECK8-NEXT:           %[[#WIDE_SHADOW:]] = or i64 %[[#WIDE_SHADOW]], %[[#WIDE_SHADOW_SHIFTED]]
+  ; CHECK8-NEXT:           %[[#WIDE_SHADOW_SHIFTED:]] = lshr i64 %[[#WIDE_SHADOW]], 8
+  ; CHECK8-NEXT:           %[[#WIDE_SHADOW:]] = or i64 %[[#WIDE_SHADOW]], %[[#WIDE_SHADOW_SHIFTED]]
+  ; CHECK8-NEXT:           %[[#SHADOW:]] = trunc i64 %[[#WIDE_SHADOW]] to i[[#SBITS]]
+  ; CHECK8-NEXT:           %[[#SHADOW2_NZ:]] = icmp ne i64 %[[#WIDE_SHADOW2]], 0
+  ; CHECK8-NEXT:           %[[#ORIGIN:]] = select i1 %[[#SHADOW2_NZ]], i32 %[[#ORIGIN2]], i32 %[[#ORIGIN]]
+
+  ; COMBINE_LOAD_PTR-NEXT: %[[#SHADOW:]] = or i[[#SBITS]] %[[#SHADOW]], %[[#PS]]
+  ; COMBINE_LOAD_PTR-NEXT: %[[#NZ:]] = icmp ne i[[#SBITS]] %[[#PS]], 0
+  ; COMBINE_LOAD_PTR-NEXT: %[[#ORIGIN:]] = select i1 %[[#NZ]], i32 %[[#PO]], i32 %[[#ORIGIN]]
+
+  ; CHECK-NEXT:            %a = load i128, i128* %p, align 8
+  ; CHECK-NEXT:            store i[[#SBITS]] %[[#SHADOW]], i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[ALIGN]]
+  ; CHECK-NEXT:            store i32 %[[#ORIGIN]], i32* @__dfsan_retval_origin_tls, align 4
+  
+  %a = load i128, i128* %p
+  ret i128 %a
+}
+
+define i17 @load17(i17* %p) {
+  ; CHECK-LABEL: @"dfs$load17"
+
+  ; COMBINE_LOAD_PTR-NEXT: %[[#PO:]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 0), align 4
+  ; COMBINE_LOAD_PTR-NEXT: %[[#PS:]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN]]
+
+  ; CHECK-NEXT: %[[#INTP:]] = bitcast i17* %p to i8*
+  ; CHECK-NEXT: %[[#LABEL_ORIGIN:]] = call zeroext i64 @__dfsan_load_label_and_origin(i8* %[[#INTP]], i64 3)
+  ; CHECK-NEXT: %[[#LABEL_ORIGIN_H32:]] = lshr i64 %[[#LABEL_ORIGIN]], 32
+  ; CHECK-NEXT: %[[#LABEL:]] = trunc i64 %[[#LABEL_ORIGIN_H32]] to i[[#SBITS]]
+  ; CHECK-NEXT: %[[#ORIGIN:]] = trunc i64 %[[#LABEL_ORIGIN]] to i32
+
+  ; COMBINE_LOAD_PTR-NEXT: %[[#LABEL:]] = or i[[#SBITS]] %[[#LABEL]], %[[#PS]]
+  ; COMBINE_LOAD_PTR-NEXT: %[[#NZ:]] = icmp ne i[[#SBITS]] %[[#PS]], 0
+  ; COMBINE_LOAD_PTR-NEXT: %[[#ORIGIN:]] = select i1 %[[#NZ]], i32 %[[#PO]], i32 %[[#ORIGIN]]
+
+  ; CHECK-NEXT: %a = load i17, i17* %p, align 4
+  ; CHECK-NEXT: store i[[#SBITS]] %[[#LABEL]], i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[ALIGN]]
+  ; CHECK-NEXT: store i32 %[[#ORIGIN]], i32* @__dfsan_retval_origin_tls, align 4
+
+  %a = load i17, i17* %p, align 4
+  ret i17 %a
+}

diff  --git a/llvm/test/Instrumentation/DataFlowSanitizer/origin_mem_intrinsic.ll b/llvm/test/Instrumentation/DataFlowSanitizer/origin_mem_intrinsic.ll
index 1adb346361f69..6edb6d566b0f6 100644
--- a/llvm/test/Instrumentation/DataFlowSanitizer/origin_mem_intrinsic.ll
+++ b/llvm/test/Instrumentation/DataFlowSanitizer/origin_mem_intrinsic.ll
@@ -1,4 +1,5 @@
-; RUN: opt < %s -dfsan -dfsan-track-origins=1 -dfsan-fast-16-labels=true -S | FileCheck %s --check-prefix=CHECK
+; RUN: opt < %s -dfsan -dfsan-track-origins=1 -dfsan-fast-8-labels=true  -S | FileCheck %s
+; RUN: opt < %s -dfsan -dfsan-track-origins=1 -dfsan-fast-16-labels=true -S | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
@@ -13,7 +14,7 @@ define void @memcpy(i8* %d, i8* %s, i32 %l) {
   ; CHECK: @"dfs$memcpy"
   ; CHECK: [[L64:%.*]] = zext i32 %l to i64
   ; CHECK: call void @__dfsan_mem_origin_transfer(i8* %d, i8* %s, i64 [[L64]])
-  ; CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 {{.*}}, i8* align 2 {{.*}}, i32 {{.*}}, i1 false)
+  ; CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align [[#SBYTES]] {{.*}}, i8* align [[#SBYTES]] {{.*}}, i32 {{.*}}, i1 false)
   ; CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %d, i8* %s, i32 %l, i1 false)
 
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %d, i8* %s, i32 %l, i1 0)
@@ -24,7 +25,7 @@ define void @memmove(i8* %d, i8* %s, i32 %l) {
   ; CHECK: @"dfs$memmove"
   ; CHECK: [[L64:%.*]] = zext i32 %l to i64
   ; CHECK: call void @__dfsan_mem_origin_transfer(i8* %d, i8* %s, i64 [[L64]])
-  ; CHECK: call void @llvm.memmove.p0i8.p0i8.i32(i8* align 2 {{.*}}, i8* align 2 {{.*}}, i32 {{.*}}, i1 false)
+  ; CHECK: call void @llvm.memmove.p0i8.p0i8.i32(i8* align [[#SBYTES]] {{.*}}, i8* align [[#SBYTES]] {{.*}}, i32 {{.*}}, i1 false)
   ; CHECK: call void @llvm.memmove.p0i8.p0i8.i32(i8* %d, i8* %s, i32 %l, i1 false)
 
   call void @llvm.memmove.p0i8.p0i8.i32(i8* %d, i8* %s, i32 %l, i1 0)
@@ -34,7 +35,7 @@ define void @memmove(i8* %d, i8* %s, i32 %l) {
 define void @memset(i8* %p, i8 %v) {
   ; CHECK: @"dfs$memset"
   ; CHECK: [[O:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 1), align 4
-  ; CHECK: [[S:%.*]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 2) to i[[#SBITS]]*), align [[#SBYTES]]
+  ; CHECK: [[S:%.*]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 2) to i[[#SBITS]]*), align [[ALIGN:2]]
   ; CHECK: call void @__dfsan_set_label(i[[#SBITS]] [[S]], i32 [[O]], i8* %p, i64 1)
   call void @llvm.memset.p0i8.i64(i8* %p, i8 %v, i64 1, i1 1)
   ret void

diff  --git a/llvm/test/Instrumentation/DataFlowSanitizer/origin_other_ops.ll b/llvm/test/Instrumentation/DataFlowSanitizer/origin_other_ops.ll
index 1d71aa08e82c4..ccd6164ee0a40 100644
--- a/llvm/test/Instrumentation/DataFlowSanitizer/origin_other_ops.ll
+++ b/llvm/test/Instrumentation/DataFlowSanitizer/origin_other_ops.ll
@@ -1,7 +1,13 @@
-; RUN: opt < %s -dfsan -dfsan-track-origins=1 -dfsan-fast-16-labels=true -S | FileCheck %s --check-prefix=CHECK
+; RUN: opt < %s -dfsan -dfsan-track-origins=1 -dfsan-fast-8-labels=true  -S | FileCheck %s
+; RUN: opt < %s -dfsan -dfsan-track-origins=1 -dfsan-fast-16-labels=true -S | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
+; CHECK: @__dfsan_arg_tls = external thread_local(initialexec) global [[TLS_ARR:\[100 x i64\]]]
+; CHECK: @__dfsan_retval_tls = external thread_local(initialexec) global [[TLS_ARR]]
+; CHECK: @__dfsan_shadow_width_bits = weak_odr constant i32 [[#SBITS:]]
+; CHECK: @__dfsan_shadow_width_bytes = weak_odr constant i32 [[#SBYTES:]]
+
 define float @unop(float %f) {
   ; CHECK: @"dfs$unop"
   ; CHECK: [[FO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 0), align 4
@@ -15,8 +21,8 @@ define i1 @binop(i1 %a, i1 %b) {
   ; CHECK: @"dfs$binop"
   ; CHECK: [[BO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 1), align 4
   ; CHECK: [[AO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 0), align 4
-  ; CHECK: [[BS:%.*]] = load i16, i16* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 2) to i16*), align 2
-  ; CHECK: [[NE:%.*]] = icmp ne i16 [[BS]], 0
+  ; CHECK: [[BS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 2) to i[[#SBITS]]*), align 2
+  ; CHECK: [[NE:%.*]] = icmp ne i[[#SBITS]] [[BS]], 0
   ; CHECK: [[MO:%.*]] = select i1 [[NE]], i32 [[BO]], i32 [[AO]]
   ; CHECK: store i32 [[MO]], i32* @__dfsan_retval_origin_tls, align 4
 
@@ -37,8 +43,8 @@ define i1 @cmpop(i1 %a, i1 %b) {
   ; CHECK: @"dfs$cmpop"
   ; CHECK: [[BO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 1), align 4
   ; CHECK: [[AO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 0), align 4
-  ; CHECK: [[BS:%.*]] = load i16, i16* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 2) to i16*), align 2
-  ; CHECK: [[NE:%.*]] = icmp ne i16 [[BS]], 0
+  ; CHECK: [[BS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 2) to i[[#SBITS]]*), align 2
+  ; CHECK: [[NE:%.*]] = icmp ne i[[#SBITS]] [[BS]], 0
   ; CHECK: [[MO:%.*]] = select i1 [[NE]], i32 [[BO]], i32 [[AO]]
   ; CHECK: store i32 [[MO]], i32* @__dfsan_retval_origin_tls, align 4
 
@@ -52,14 +58,14 @@ define i32* @gepop([10 x [20 x i32]]* %p, i32 %a, i32 %b, i32 %c) {
   ; CHECK: [[BO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 2), align 4
   ; CHECK: [[AO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 1), align 4
   ; CHECK: [[PO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 0), align 4
-  ; CHECK: [[CS:%.*]] = load i16, i16* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 6) to i16*), align 2
-  ; CHECK: [[BS:%.*]] = load i16, i16* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 4) to i16*), align 2
-  ; CHECK: [[AS:%.*]] = load i16, i16* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 2) to i16*), align 2
-  ; CHECK: [[AS_NE:%.*]] = icmp ne i16 [[AS]], 0
+  ; CHECK: [[CS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 6) to i[[#SBITS]]*), align 2
+  ; CHECK: [[BS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 4) to i[[#SBITS]]*), align 2
+  ; CHECK: [[AS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 2) to i[[#SBITS]]*), align 2
+  ; CHECK: [[AS_NE:%.*]] = icmp ne i[[#SBITS]] [[AS]], 0
   ; CHECK: [[APO:%.*]] = select i1 [[AS_NE]], i32 [[AO]], i32 [[PO]]
-  ; CHECK: [[BS_NE:%.*]] = icmp ne i16 [[BS]], 0
+  ; CHECK: [[BS_NE:%.*]] = icmp ne i[[#SBITS]] [[BS]], 0
   ; CHECK: [[ABPO:%.*]] = select i1 [[BS_NE]], i32 [[BO]], i32 [[APO]]
-  ; CHECK: [[CS_NE:%.*]] = icmp ne i16 [[CS]], 0
+  ; CHECK: [[CS_NE:%.*]] = icmp ne i[[#SBITS]] [[CS]], 0
   ; CHECK: [[ABCPO:%.*]] = select i1 [[CS_NE]], i32 [[CO]], i32 [[ABPO]]
   ; CHECK: store i32 [[ABCPO]], i32* @__dfsan_retval_origin_tls, align 4
 
@@ -71,8 +77,8 @@ define i32 @eeop(<4 x i32> %a, i32 %b) {
   ; CHECK: @"dfs$eeop"
   ; CHECK: [[BO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 1), align 4
   ; CHECK: [[AO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 0), align 4
-  ; CHECK: [[BS:%.*]] = load i16, i16* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 2) to i16*), align 2
-  ; CHECK: [[NE:%.*]] = icmp ne i16 [[BS]], 0
+  ; CHECK: [[BS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 2) to i[[#SBITS]]*), align 2
+  ; CHECK: [[NE:%.*]] = icmp ne i[[#SBITS]] [[BS]], 0
   ; CHECK: [[MO:%.*]] = select i1 [[NE]], i32 [[BO]], i32 [[AO]]
   ; CHECK: store i32 [[MO]], i32* @__dfsan_retval_origin_tls, align 4
 
@@ -85,11 +91,11 @@ define <4 x i32> @ieop(<4 x i32> %p, i32 %a, i32 %b) {
   ; CHECK: [[BO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 2), align 4
   ; CHECK: [[AO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 1), align 4
   ; CHECK: [[PO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 0), align 4
-  ; CHECK: [[BS:%.*]] = load i16, i16* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 4) to i16*), align 2
-  ; CHECK: [[AS:%.*]] = load i16, i16* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 2) to i16*), align 2
-  ; CHECK: [[AS_NE:%.*]] = icmp ne i16 [[AS]], 0
+  ; CHECK: [[BS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 4) to i[[#SBITS]]*), align 2
+  ; CHECK: [[AS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 2) to i[[#SBITS]]*), align 2
+  ; CHECK: [[AS_NE:%.*]] = icmp ne i[[#SBITS]] [[AS]], 0
   ; CHECK: [[APO:%.*]] = select i1 [[AS_NE]], i32 [[AO]], i32 [[PO]]
-  ; CHECK: [[BS_NE:%.*]] = icmp ne i16 [[BS]], 0
+  ; CHECK: [[BS_NE:%.*]] = icmp ne i[[#SBITS]] [[BS]], 0
   ; CHECK: [[ABPO:%.*]] = select i1 [[BS_NE]], i32 [[BO]], i32 [[APO]]
   ; CHECK: store i32 [[ABPO]], i32* @__dfsan_retval_origin_tls, align 4
 
@@ -101,8 +107,8 @@ define <4 x i32> @svop(<4 x i32> %a, <4 x i32> %b) {
   ; CHECK: @"dfs$svop"
   ; CHECK: [[BO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 1), align 4
   ; CHECK: [[AO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 0), align 4
-  ; CHECK: [[BS:%.*]] = load i16, i16* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 2) to i16*), align 2
-  ; CHECK: [[NE:%.*]] = icmp ne i16 [[BS]], 0
+  ; CHECK: [[BS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 2) to i[[#SBITS]]*), align 2
+  ; CHECK: [[NE:%.*]] = icmp ne i[[#SBITS]] [[BS]], 0
   ; CHECK: [[MO:%.*]] = select i1 [[NE]], i32 [[BO]], i32 [[AO]]
   ; CHECK: store i32 [[MO]], i32* @__dfsan_retval_origin_tls, align 4
   
@@ -123,11 +129,13 @@ define {i32, {float, float}} @ivop({i32, {float, float}} %a, {float, float} %b)
   ; CHECK: @"dfs$ivop"
   ; CHECK: [[BO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 1), align 4
   ; CHECK: [[AO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 0), align 4
-  ; CHECK: [[BS:%.*]] = load { i16, i16 }, { i16, i16 }* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 6) to { i16, i16 }*), align 2
-  ; CHECK: [[BS0:%.*]] = extractvalue { i16, i16 } [[BS]], 0
-  ; CHECK: [[BS1:%.*]] = extractvalue { i16, i16 } [[BS]], 1
-  ; CHECK: [[BS01:%.*]] = or i16 [[BS0]], [[BS1]]
-  ; CHECK: [[NE:%.*]] = icmp ne i16 [[BS01]], 0
+  ; COMM: TODO simplify the expression [[#mul(2,SBYTES) + max(SBYTES,2)]] to
+  ; COMM: [[#mul(3,SBYTES)]], if shadow-tls-alignment is updated to match shadow width bytes.
+  ; CHECK: [[BS:%.*]] = load { i[[#SBITS]], i[[#SBITS]] }, { i[[#SBITS]], i[[#SBITS]] }* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 [[#mul(2,SBYTES) + max(SBYTES,2)]]) to { i[[#SBITS]], i[[#SBITS]] }*), align 2
+  ; CHECK: [[BS0:%.*]] = extractvalue { i[[#SBITS]], i[[#SBITS]] } [[BS]], 0
+  ; CHECK: [[BS1:%.*]] = extractvalue { i[[#SBITS]], i[[#SBITS]] } [[BS]], 1
+  ; CHECK: [[BS01:%.*]] = or i[[#SBITS]] [[BS0]], [[BS1]]
+  ; CHECK: [[NE:%.*]] = icmp ne i[[#SBITS]] [[BS01]], 0
   ; CHECK: [[MO:%.*]] = select i1 [[NE]], i32 [[BO]], i32 [[AO]]
   ; CHECK: store i32 [[MO]], i32* @__dfsan_retval_origin_tls, align 4
   

diff  --git a/llvm/test/Instrumentation/DataFlowSanitizer/origin_phi.ll b/llvm/test/Instrumentation/DataFlowSanitizer/origin_phi.ll
index 1b260b56bab75..a5d8b46603686 100644
--- a/llvm/test/Instrumentation/DataFlowSanitizer/origin_phi.ll
+++ b/llvm/test/Instrumentation/DataFlowSanitizer/origin_phi.ll
@@ -1,4 +1,5 @@
-; RUN: opt < %s -dfsan -dfsan-track-origins=1 -dfsan-fast-16-labels=true -S | FileCheck %s --check-prefix=CHECK
+; RUN: opt < %s -dfsan -dfsan-track-origins=1 -dfsan-fast-8-labels=true  -S | FileCheck %s
+; RUN: opt < %s -dfsan -dfsan-track-origins=1 -dfsan-fast-16-labels=true -S | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
@@ -11,8 +12,8 @@ define i32 @phiop(i32 %a, i32 %b, i1 %c) {
   ; CHECK: entry:
   ; CHECK: [[BO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 1), align 4
   ; CHECK: [[AO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 0), align 4
-  ; CHECK: [[BS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 2) to i[[#SBITS]]*), align [[#SBYTES]]
-  ; CHECK: [[AS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[#SBYTES]]
+  ; CHECK: [[BS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 2) to i[[#SBITS]]*), align [[ALIGN:2]]
+  ; CHECK: [[AS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN]]
   ; CHECK: br i1 %c, label %next, label %done
   ; CHECK: next:
   ; CHECK: br i1 %c, label %T, label %F

diff  --git a/llvm/test/Instrumentation/DataFlowSanitizer/origin_select.ll b/llvm/test/Instrumentation/DataFlowSanitizer/origin_select.ll
index 9690dcf9316f1..53849701ff582 100644
--- a/llvm/test/Instrumentation/DataFlowSanitizer/origin_select.ll
+++ b/llvm/test/Instrumentation/DataFlowSanitizer/origin_select.ll
@@ -1,16 +1,23 @@
-; RUN: opt < %s -dfsan -dfsan-track-select-control-flow=1 -dfsan-track-origins=1 -dfsan-fast-16-labels=true -S | FileCheck %s --check-prefix=TRACK_CONTROL_FLOW
-; RUN: opt < %s -dfsan -dfsan-track-select-control-flow=0 -dfsan-track-origins=1 -dfsan-fast-16-labels=true -S | FileCheck %s --check-prefix=NO_TRACK_CONTROL_FLOW
+; RUN: opt < %s -dfsan -dfsan-track-select-control-flow=1 -dfsan-track-origins=1 -dfsan-fast-8-labels=true  -S | FileCheck %s --check-prefixes=CHECK,TRACK_CONTROL_FLOW
+; RUN: opt < %s -dfsan -dfsan-track-select-control-flow=0 -dfsan-track-origins=1 -dfsan-fast-8-labels=true  -S | FileCheck %s --check-prefixes=CHECK,NO_TRACK_CONTROL_FLOW
+; RUN: opt < %s -dfsan -dfsan-track-select-control-flow=1 -dfsan-track-origins=1 -dfsan-fast-16-labels=true -S | FileCheck %s --check-prefixes=CHECK,TRACK_CONTROL_FLOW
+; RUN: opt < %s -dfsan -dfsan-track-select-control-flow=0 -dfsan-track-origins=1 -dfsan-fast-16-labels=true -S | FileCheck %s --check-prefixes=CHECK,NO_TRACK_CONTROL_FLOW
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
+; CHECK: @__dfsan_arg_tls = external thread_local(initialexec) global [[TLS_ARR:\[100 x i64\]]]
+; CHECK: @__dfsan_retval_tls = external thread_local(initialexec) global [[TLS_ARR]]
+; CHECK: @__dfsan_shadow_width_bits = weak_odr constant i32 [[#SBITS:]]
+; CHECK: @__dfsan_shadow_width_bytes = weak_odr constant i32 [[#SBYTES:]]
+
 define i8 @select8(i1 %c, i8 %t, i8 %f) {
   ; TRACK_CONTROL_FLOW: @"dfs$select8"
   ; TRACK_CONTROL_FLOW: [[CO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 0), align 4
   ; TRACK_CONTROL_FLOW: [[FO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 2), align 4
   ; TRACK_CONTROL_FLOW: [[TO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 1), align 4
-  ; TRACK_CONTROL_FLOW: [[CS:%.*]] = load i16, i16* bitcast ([100 x i64]* @__dfsan_arg_tls to i16*), align 2
+  ; TRACK_CONTROL_FLOW: [[CS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align 2
   ; TRACK_CONTROL_FLOW: [[TFO:%.*]] = select i1 %c, i32 [[TO]], i32 [[FO]]
-  ; TRACK_CONTROL_FLOW: [[CS_NE:%.*]] = icmp ne i16 [[CS]], 0
+  ; TRACK_CONTROL_FLOW: [[CS_NE:%.*]] = icmp ne i[[#SBITS]] [[CS]], 0
   ; TRACK_CONTROL_FLOW: [[CTFO:%.*]] = select i1 [[CS_NE]], i32 [[CO]], i32 [[TFO]]
   ; TRACK_CONTROL_FLOW: store i32 [[CTFO]], i32* @__dfsan_retval_origin_tls, align 4
 
@@ -28,8 +35,8 @@ define i8 @select8e(i1 %c, i8 %tf) {
   ; TRACK_CONTROL_FLOW: @"dfs$select8e"
   ; TRACK_CONTROL_FLOW: [[CO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 0), align 4
   ; TRACK_CONTROL_FLOW: [[TFO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 1), align 4
-  ; TRACK_CONTROL_FLOW: [[CS:%.*]] = load i16, i16* bitcast ([100 x i64]* @__dfsan_arg_tls to i16*), align 2
-  ; TRACK_CONTROL_FLOW: [[CS_NE:%.*]] = icmp ne i16 [[CS]], 0
+  ; TRACK_CONTROL_FLOW: [[CS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align 2
+  ; TRACK_CONTROL_FLOW: [[CS_NE:%.*]] = icmp ne i[[#SBITS]] [[CS]], 0
   ; TRACK_CONTROL_FLOW: [[CTFO:%.*]] = select i1 [[CS_NE]], i32 [[CO]], i32 [[TFO]]
   ; TRACK_CONTROL_FLOW: store i32 [[CTFO]], i32* @__dfsan_retval_origin_tls, align 4
 
@@ -46,19 +53,19 @@ define <4 x i8> @select8v(<4 x i1> %c, <4 x i8> %t, <4 x i8> %f) {
   ; TRACK_CONTROL_FLOW: [[CO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 0), align 4
   ; TRACK_CONTROL_FLOW: [[FO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 2), align 4
   ; TRACK_CONTROL_FLOW: [[TO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 1), align 4
-  ; TRACK_CONTROL_FLOW: [[FS:%.*]] = load i16, i16* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 4) to i16*), align 2
-  ; TRACK_CONTROL_FLOW: [[CS:%.*]] = load i16, i16* bitcast ([100 x i64]* @__dfsan_arg_tls to i16*), align 2
-  ; TRACK_CONTROL_FLOW: [[FS_NE:%.*]] = icmp ne i16 [[FS]], 0
+  ; TRACK_CONTROL_FLOW: [[FS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 4) to i[[#SBITS]]*), align 2
+  ; TRACK_CONTROL_FLOW: [[CS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align 2
+  ; TRACK_CONTROL_FLOW: [[FS_NE:%.*]] = icmp ne i[[#SBITS]] [[FS]], 0
   ; TRACK_CONTROL_FLOW: [[FTO:%.*]] = select i1 [[FS_NE]], i32 [[FO]], i32 [[TO]]
-  ; TRACK_CONTROL_FLOW: [[CS_NE:%.*]] = icmp ne i16 [[CS]], 0
+  ; TRACK_CONTROL_FLOW: [[CS_NE:%.*]] = icmp ne i[[#SBITS]] [[CS]], 0
   ; TRACK_CONTROL_FLOW: [[CFTO:%.*]] = select i1 [[CS_NE]], i32 [[CO]], i32 [[FTO]]
   ; TRACK_CONTROL_FLOW: store i32 [[CFTO]], i32* @__dfsan_retval_origin_tls, align 4
 
   ; NO_TRACK_CONTROL_FLOW: @"dfs$select8v"
   ; NO_TRACK_CONTROL_FLOW: [[FO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 2), align 4
   ; NO_TRACK_CONTROL_FLOW: [[TO:%.*]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 1), align 4
-  ; NO_TRACK_CONTROL_FLOW: [[FS:%.*]] = load i16, i16* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 4) to i16*), align 2
-  ; NO_TRACK_CONTROL_FLOW: [[FS_NE:%.*]] = icmp ne i16 [[FS]], 0
+  ; NO_TRACK_CONTROL_FLOW: [[FS:%.*]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 4) to i[[#SBITS]]*), align 2
+  ; NO_TRACK_CONTROL_FLOW: [[FS_NE:%.*]] = icmp ne i[[#SBITS]] [[FS]], 0
   ; NO_TRACK_CONTROL_FLOW: [[FTO:%.*]] = select i1 [[FS_NE]], i32 [[FO]], i32 [[TO]]
   ; NO_TRACK_CONTROL_FLOW: store i32 [[FTO]], i32* @__dfsan_retval_origin_tls, align 4
 

diff  --git a/llvm/test/Instrumentation/DataFlowSanitizer/origin_store.ll b/llvm/test/Instrumentation/DataFlowSanitizer/origin_store.ll
new file mode 100644
index 0000000000000..72d7c6e816456
--- /dev/null
+++ b/llvm/test/Instrumentation/DataFlowSanitizer/origin_store.ll
@@ -0,0 +1,164 @@
+; RUN: opt < %s -dfsan -dfsan-track-origins=1 -dfsan-fast-8-labels -S | FileCheck %s
+; RUN: opt < %s -dfsan -dfsan-track-origins=1 -dfsan-fast-8-labels -dfsan-combine-pointer-labels-on-store -S | FileCheck %s --check-prefixes=CHECK,COMBINE_STORE_PTR
+; RUN: opt < %s -dfsan -dfsan-track-origins=1 -dfsan-fast-16-labels -S | FileCheck %s --check-prefixes=CHECK,CHECK16
+; RUN: opt < %s -dfsan -dfsan-track-origins=1 -dfsan-fast-16-labels -dfsan-combine-pointer-labels-on-store -S | FileCheck %s --check-prefixes=CHECK,CHECK16,COMBINE_STORE_PTR
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK: @__dfsan_shadow_width_bits = weak_odr constant i32 [[#SBITS:]]
+; CHECK: @__dfsan_shadow_width_bytes = weak_odr constant i32 [[#SBYTES:]]
+
+define void @store_zero_to_non_escaped_alloca() {
+  ; CHECK-LABEL: @"dfs$store_zero_to_non_escaped_alloca"
+  ; CHECK-NEXT: [[A:%.*]] = alloca i[[#SBITS]], align [[#SBYTES]]
+  ; CHECK-NEXT: %_dfsa = alloca i32, align 4
+  ; CHECK-NEXT: %p = alloca i16, align 2
+  ; CHECK-NEXT: store i[[#SBITS]] 0, i[[#SBITS]]* [[A]], align [[#SBYTES]]
+  ; CHECK-NEXT: store i16 1, i16* %p, align 2
+  ; CHECK-NEXT: ret void
+  
+  %p = alloca i16
+  store i16 1, i16* %p
+  ret void
+}
+
+define void @store_nonzero_to_non_escaped_alloca(i16 %a) {
+  ; CHECK-LABEL: @"dfs$store_nonzero_to_non_escaped_alloca"
+  ; CHECK: %[[#AO:]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 0), align 4
+  ; CHECK: %_dfsa = alloca i32, align 4
+  ; CHECK: store i32 %[[#AO]], i32* %_dfsa, align 4
+  
+  %p = alloca i16
+  store i16 %a, i16* %p
+  ret void
+}
+
+declare void @foo(i16* %p)
+
+define void @store_zero_to_escaped_alloca() {
+  ; CHECK-LABEL: @"dfs$store_zero_to_escaped_alloca"
+  ; CHECK:       %[[#SA:]] = bitcast i[[#SBITS]]* {{.*}} to i[[#NUM_BITS:mul(SBITS,2)]]*
+  ; CHECK-NEXT:  store i[[#NUM_BITS]] 0, i[[#NUM_BITS]]* %[[#SA]], align [[#SBYTES]]
+  ; CHECK-NEXT:  store i16 1, i16* %p, align 2
+  ; CHECK-NEXT:  store i[[#SBITS]] 0, i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN:2]]
+  ; CHECK-NEXT:  call void @"dfs$foo"(i16* %p)
+
+  %p = alloca i16
+  store i16 1, i16* %p
+  call void @foo(i16* %p)
+  ret void
+}
+
+define void @store_nonzero_to_escaped_alloca(i16 %a) {
+  ; CHECK-LABEL:  @"dfs$store_nonzero_to_escaped_alloca"
+  ; CHECK-NEXT:   %[[#AO:]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 0), align 4
+  ; CHECK-NEXT:   %[[#AS:]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN]]
+  ; CHECK:        %[[#INTP:]] = ptrtoint i16* %p to i64
+  ; CHECK-NEXT:   %[[#SHADOW_ADDR:]] = and i64 %[[#INTP]], [[#%.10d,MASK:]]
+  ; CHECK16-NEXT: %[[#SHADOW_ADDR:]] = mul i64 %[[#SHADOW_ADDR]], 2
+  ; CHECK-NEXT:   %[[#SHADOW_PTR0:]] = inttoptr i64 %[[#SHADOW_ADDR]] to i[[#SBITS]]*
+  ; CHECK-NEXT:   %[[#ORIGIN_OFFSET:]] = add i64 %[[#INTP+1]], [[#%.10d,ORIGIN_MASK:]]
+  ; CHECK-NEXT:   %[[#ORIGIN_ADDR:]] = and i64 %[[#ORIGIN_OFFSET]], -4
+  ; CHECK-NEXT:   %[[#ORIGIN_PTR:]] = inttoptr i64 %[[#ORIGIN_ADDR]] to i32*
+  ; CHECK:        %_dfscmp = icmp ne i[[#SBITS]] %[[#AS]], 0
+  ; CHECK-NEXT:   br i1 %_dfscmp, label %[[L1:.*]], label %[[L2:.*]],
+  ; CHECK:       [[L1]]:
+  ; CHECK-NEXT:   %[[#NO:]] = call i32 @__dfsan_chain_origin(i32 %[[#AO]])
+  ; CHECK-NEXT:   store i32 %[[#NO]], i32* %[[#ORIGIN_PTR]], align 4
+  ; CHECK-NEXT:   br label %[[L2]]
+  ; CHECK:       [[L2]]:
+  ; CHECK-NEXT:    store i16 %a, i16* %p, align 2
+  
+  %p = alloca i16
+  store i16 %a, i16* %p
+  call void @foo(i16* %p)
+  ret void
+}
+
+define void @store64_align8(i64* %p, i64 %a) {
+  ; CHECK-LABEL: @"dfs$store64_align8"
+
+  ; COMBINE_STORE_PTR-NEXT: %[[#PO:]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 0), align 4
+  ; COMBINE_STORE_PTR-NEXT: %[[#PS:]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN]]
+
+  ; CHECK-NEXT:  %[[#AO:]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 1), align 4
+  ; CHECK-NEXT:  %[[#AS:]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 2) to i[[#SBITS]]*), align [[ALIGN]]
+
+  ; COMBINE_STORE_PTR-NEXT: %[[#AS:]] = or i[[#SBITS]] %[[#AS]], %[[#PS]]
+  ; COMBINE_STORE_PTR-NEXT: %[[#NE:]] = icmp ne i[[#SBITS]] %[[#PS]], 0
+  ; COMBINE_STORE_PTR-NEXT: %[[#AO:]] = select i1 %[[#NE]], i32 %[[#PO]], i32 %[[#AO]]
+
+  ; CHECK:       %_dfscmp = icmp ne i[[#SBITS]] %[[#AS]], 0
+  ; CHECK-NEXT:  br i1 %_dfscmp, label %[[L1:.*]], label %[[L2:.*]],
+  ; CHECK:      [[L1]]:
+  ; CHECK-NEXT:  %[[#NO:]] = call i32 @__dfsan_chain_origin(i32 %[[#AO]])
+  ; CHECK-NEXT:  %[[#NO_ZEXT:]] = zext i32 %[[#NO]] to i64
+  ; CHECK-NEXT:  %[[#NO_SHL:]] = shl i64 %[[#NO_ZEXT]], 32
+  ; CHECK-NEXT:  %[[#NO2:]] = or i64 %[[#NO_ZEXT]], %[[#NO_SHL]]
+  ; CHECK-NEXT:  %[[#O_PTR:]] = bitcast i32* {{.*}} to i64*
+  ; CHECK-NEXT:  store i64 %[[#NO2]], i64* %[[#O_PTR]], align 8
+  ; CHECK-NEXT:  br label %[[L2]]
+  ; CHECK:      [[L2]]:
+  ; CHECK-NEXT:  store i64 %a, i64* %p, align 8
+  
+  store i64 %a, i64* %p
+  ret void
+}
+
+define void @store64_align2(i64* %p, i64 %a) {
+  ; CHECK-LABEL: @"dfs$store64_align2"
+
+  ; COMBINE_STORE_PTR-NEXT: %[[#PO:]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 0), align 4
+  ; COMBINE_STORE_PTR-NEXT: %[[#PS:]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN]]
+
+  ; CHECK-NEXT: %[[#AO:]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 1), align 4
+  ; CHECK-NEXT: %[[#AS:]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 2) to i[[#SBITS]]*), align [[ALIGN]]
+
+  ; COMBINE_STORE_PTR-NEXT: %[[#AS:]] = or i[[#SBITS]] %[[#AS]], %[[#PS]]
+  ; COMBINE_STORE_PTR-NEXT: %[[#NE:]] = icmp ne i[[#SBITS]] %[[#PS]], 0
+  ; COMBINE_STORE_PTR-NEXT: %[[#AO:]] = select i1 %[[#NE]], i32 %[[#PO]], i32 %[[#AO]]
+
+  ; CHECK:      %_dfscmp = icmp ne i[[#SBITS]] %[[#AS]], 0
+  ; CHECK-NEXT: br i1 %_dfscmp, label %[[L1:.*]], label %[[L2:.*]],
+  ; CHECK:     [[L1]]:
+  ; CHECK-NEXT: %[[#NO:]] = call i32 @__dfsan_chain_origin(i32 %[[#AO]])
+  ; CHECK-NEXT: store i32 %[[#NO]], i32* %[[#O_PTR0:]], align 4
+  ; CHECK-NEXT: %[[#O_PTR1:]] = getelementptr i32, i32* %[[#O_PTR0]], i32 1
+  ; CHECK-NEXT: store i32 %[[#NO]], i32* %[[#O_PTR1]], align 4
+  ; CHECK:     [[L2]]:
+  ; CHECK-NEXT: store i64 %a, i64* %p, align 2
+  
+  store i64 %a, i64* %p, align 2
+  ret void
+}
+
+define void @store96_align8(i96* %p, i96 %a) {
+  ; CHECK-LABEL: @"dfs$store96_align8"
+
+  ; COMBINE_STORE_PTR-NEXT: %[[#PO:]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 0), align 4
+  ; COMBINE_STORE_PTR-NEXT: %[[#PS:]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN]]
+
+  ; CHECK-NEXT: %[[#AO:]] = load i32, i32* getelementptr inbounds ([200 x i32], [200 x i32]* @__dfsan_arg_origin_tls, i64 0, i64 1), align 4
+  ; CHECK-NEXT: %[[#AS:]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([100 x i64]* @__dfsan_arg_tls to i64), i64 2) to i[[#SBITS]]*), align [[ALIGN]]
+
+  ; COMBINE_STORE_PTR-NEXT: %[[#AS:]] = or i[[#SBITS]] %[[#AS]], %[[#PS]]
+  ; COMBINE_STORE_PTR-NEXT: %[[#NE:]] = icmp ne i[[#SBITS]] %[[#PS]], 0
+  ; COMBINE_STORE_PTR-NEXT: %[[#AO:]] = select i1 %[[#NE]], i32 %[[#PO]], i32 %[[#AO]]
+
+  ; CHECK:      %_dfscmp = icmp ne i[[#SBITS]] %[[#AS]], 0
+  ; CHECK-NEXT: br i1 %_dfscmp, label %[[L1:.*]], label %[[L2:.*]],
+  ; CHECK:     [[L1]]:
+  ; CHECK-NEXT: %[[#NO:]] = call i32 @__dfsan_chain_origin(i32 %[[#AO]])
+  ; CHECK-NEXT: %[[#NO_ZEXT:]] = zext i32 %[[#NO]] to i64
+  ; CHECK-NEXT: %[[#NO_SHL:]] = shl i64 %[[#NO_ZEXT]], 32
+  ; CHECK-NEXT: %[[#NO2:]] = or i64 %[[#NO_ZEXT]], %[[#NO_SHL]]
+  ; CHECK-NEXT: %[[#O_PTR64:]] = bitcast i32* %[[#O_PTR0:]] to i64*
+  ; CHECK-NEXT: store i64 %[[#NO2]], i64* %[[#O_PTR64]], align 8
+  ; CHECK-NEXT: %[[#O_PTR1:]] = getelementptr i32, i32* %[[#O_PTR0]], i32 2
+  ; CHECK-NEXT: store i32 %[[#NO]], i32* %[[#O_PTR1]], align 8
+  ; CHECK:     [[L2]]:
+  ; CHECK-NEXT: store i96 %a, i96* %p, align 8
+  
+  store i96 %a, i96* %p, align 8
+  ret void
+}

diff  --git a/llvm/test/Instrumentation/DataFlowSanitizer/origin_store_threshold.ll b/llvm/test/Instrumentation/DataFlowSanitizer/origin_store_threshold.ll
index 3b3be8392cfea..ce1ec49e0f9fd 100644
--- a/llvm/test/Instrumentation/DataFlowSanitizer/origin_store_threshold.ll
+++ b/llvm/test/Instrumentation/DataFlowSanitizer/origin_store_threshold.ll
@@ -1,4 +1,5 @@
-; RUN: opt < %s -dfsan -dfsan-track-origins=1 -dfsan-fast-16-labels=true -dfsan-instrument-with-call-threshold=0 -S | FileCheck %s --check-prefix=CHECK
+; RUN: opt < %s -dfsan -dfsan-track-origins=1 -dfsan-fast-8-labels=true  -dfsan-instrument-with-call-threshold=0 -S | FileCheck %s
+; RUN: opt < %s -dfsan -dfsan-track-origins=1 -dfsan-fast-16-labels=true -dfsan-instrument-with-call-threshold=0 -S | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 


        


More information about the llvm-commits mailing list