[llvm-branch-commits] [clang] [compiler-rt] [llvm] [TySan] A Type Sanitizer (Runtime Library) (PR #76261)
Florian Hahn via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Fri Dec 6 04:09:36 PST 2024
https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/76261
>From 49abcd207fe26ea0fc7170e66f1b0b22f1d853d3 Mon Sep 17 00:00:00 2001
From: Akshat Oke <Akshat.Oke at amd.com>
Date: Fri, 6 Dec 2024 15:25:54 +0530
Subject: [PATCH 01/18] [CodeGen][PM] Initialize analyses with isAnalysis=true
(#118779)
Analyses should be marked as analyses.
Otherwise they are prone to being ignored by the legacy analysis caching mechanism and to being scheduled redundantly.
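For context, a minimal sketch of legacy pass registration (illustrative only; MyInfoWrapperLegacy, its argument string, and the exact header set are hypothetical). The final INITIALIZE_PASS argument is the isAnalysis flag this patch flips to true, which lets the legacy pass manager treat the pass as a cacheable analysis:

  #include "llvm/Pass.h"
  #include "llvm/PassRegistry.h"
  using namespace llvm;

  namespace {
  // Hypothetical legacy wrapper pass, used only to show the macro arguments.
  struct MyInfoWrapperLegacy : public FunctionPass {
    static char ID;
    MyInfoWrapperLegacy() : FunctionPass(ID) {}
    bool runOnFunction(Function &) override { return false; }
  };
  } // namespace
  char MyInfoWrapperLegacy::ID = 0;

  // Normally declared in llvm/InitializePasses.h for in-tree passes.
  namespace llvm {
  void initializeMyInfoWrapperLegacyPass(PassRegistry &);
  } // namespace llvm

  // INITIALIZE_PASS(passName, arg, name, cfgOnly, isAnalysis): passing true as
  // the last argument registers the pass as an analysis, so its results can be
  // cached and reused instead of the pass being scheduled again.
  INITIALIZE_PASS(MyInfoWrapperLegacy, "my-info", "My Info Analysis",
                  /*cfgOnly=*/false, /*isAnalysis=*/true)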
---
llvm/lib/CodeGen/GCMetadata.cpp | 2 +-
llvm/lib/CodeGen/LiveDebugVariables.cpp | 2 +-
llvm/lib/CodeGen/LiveIntervals.cpp | 2 +-
llvm/lib/CodeGen/LiveRegMatrix.cpp | 2 +-
llvm/lib/CodeGen/LiveStacks.cpp | 2 +-
llvm/lib/CodeGen/VirtRegMap.cpp | 2 +-
llvm/lib/Transforms/Utils/LoopSimplify.cpp | 4 ++--
7 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/CodeGen/GCMetadata.cpp b/llvm/lib/CodeGen/GCMetadata.cpp
index 6d1cc1a58e27df..f33008c9e0f232 100644
--- a/llvm/lib/CodeGen/GCMetadata.cpp
+++ b/llvm/lib/CodeGen/GCMetadata.cpp
@@ -66,7 +66,7 @@ GCFunctionAnalysis::run(Function &F, FunctionAnalysisManager &FAM) {
}
INITIALIZE_PASS(GCModuleInfo, "collector-metadata",
- "Create Garbage Collector Module Metadata", false, false)
+ "Create Garbage Collector Module Metadata", false, true)
// -----------------------------------------------------------------------------
diff --git a/llvm/lib/CodeGen/LiveDebugVariables.cpp b/llvm/lib/CodeGen/LiveDebugVariables.cpp
index 317d3401f000a4..79085e587ebc45 100644
--- a/llvm/lib/CodeGen/LiveDebugVariables.cpp
+++ b/llvm/lib/CodeGen/LiveDebugVariables.cpp
@@ -82,7 +82,7 @@ INITIALIZE_PASS_BEGIN(LiveDebugVariablesWrapperLegacy, DEBUG_TYPE,
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
INITIALIZE_PASS_END(LiveDebugVariablesWrapperLegacy, DEBUG_TYPE,
- "Debug Variable Analysis", false, false)
+ "Debug Variable Analysis", false, true)
void LiveDebugVariablesWrapperLegacy::getAnalysisUsage(
AnalysisUsage &AU) const {
diff --git a/llvm/lib/CodeGen/LiveIntervals.cpp b/llvm/lib/CodeGen/LiveIntervals.cpp
index f9ee6e4563f8d6..f38527a3ce6a31 100644
--- a/llvm/lib/CodeGen/LiveIntervals.cpp
+++ b/llvm/lib/CodeGen/LiveIntervals.cpp
@@ -83,7 +83,7 @@ INITIALIZE_PASS_BEGIN(LiveIntervalsWrapperPass, "liveintervals",
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(SlotIndexesWrapperPass)
INITIALIZE_PASS_END(LiveIntervalsWrapperPass, "liveintervals",
- "Live Interval Analysis", false, false)
+ "Live Interval Analysis", false, true)
bool LiveIntervalsWrapperPass::runOnMachineFunction(MachineFunction &MF) {
LIS.Indexes = &getAnalysis<SlotIndexesWrapperPass>().getSI();
diff --git a/llvm/lib/CodeGen/LiveRegMatrix.cpp b/llvm/lib/CodeGen/LiveRegMatrix.cpp
index bc8c59381a40e1..9744c47d5a8510 100644
--- a/llvm/lib/CodeGen/LiveRegMatrix.cpp
+++ b/llvm/lib/CodeGen/LiveRegMatrix.cpp
@@ -41,7 +41,7 @@ INITIALIZE_PASS_BEGIN(LiveRegMatrixWrapperLegacy, "liveregmatrix",
INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(VirtRegMapWrapperLegacy)
INITIALIZE_PASS_END(LiveRegMatrixWrapperLegacy, "liveregmatrix",
- "Live Register Matrix", false, false)
+ "Live Register Matrix", false, true)
void LiveRegMatrixWrapperLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
diff --git a/llvm/lib/CodeGen/LiveStacks.cpp b/llvm/lib/CodeGen/LiveStacks.cpp
index 92cc6699f2d331..d615caf48c0ad3 100644
--- a/llvm/lib/CodeGen/LiveStacks.cpp
+++ b/llvm/lib/CodeGen/LiveStacks.cpp
@@ -25,7 +25,7 @@ INITIALIZE_PASS_BEGIN(LiveStacksWrapperLegacy, DEBUG_TYPE,
"Live Stack Slot Analysis", false, false)
INITIALIZE_PASS_DEPENDENCY(SlotIndexesWrapperPass)
INITIALIZE_PASS_END(LiveStacksWrapperLegacy, DEBUG_TYPE,
- "Live Stack Slot Analysis", false, false)
+ "Live Stack Slot Analysis", false, true)
char &llvm::LiveStacksID = LiveStacksWrapperLegacy::ID;
diff --git a/llvm/lib/CodeGen/VirtRegMap.cpp b/llvm/lib/CodeGen/VirtRegMap.cpp
index 1352102a93d01b..b28c74600e7a29 100644
--- a/llvm/lib/CodeGen/VirtRegMap.cpp
+++ b/llvm/lib/CodeGen/VirtRegMap.cpp
@@ -60,7 +60,7 @@ STATISTIC(NumIdCopies, "Number of identity moves eliminated after rewriting");
char VirtRegMapWrapperLegacy::ID = 0;
INITIALIZE_PASS(VirtRegMapWrapperLegacy, "virtregmap", "Virtual Register Map",
- false, false)
+ false, true)
void VirtRegMap::init(MachineFunction &mf) {
MRI = &mf.getRegInfo();
diff --git a/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/llvm/lib/Transforms/Utils/LoopSimplify.cpp
index 44fdfe530178a9..d8298646e18d7e 100644
--- a/llvm/lib/Transforms/Utils/LoopSimplify.cpp
+++ b/llvm/lib/Transforms/Utils/LoopSimplify.cpp
@@ -777,8 +777,8 @@ INITIALIZE_PASS_BEGIN(LoopSimplify, "loop-simplify",
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_END(LoopSimplify, "loop-simplify",
- "Canonicalize natural loops", false, false)
+INITIALIZE_PASS_END(LoopSimplify, "loop-simplify", "Canonicalize natural loops",
+ false, true)
// Publicly exposed interface to pass...
char &llvm::LoopSimplifyID = LoopSimplify::ID;
>From 82c93b6f19bf9db75ac6e4100b9f1b4f2a7df67f Mon Sep 17 00:00:00 2001
From: Akshay Deodhar <adeodhar at nvidia.com>
Date: Fri, 6 Dec 2024 15:26:45 +0530
Subject: [PATCH 02/18] [SCEV] Simplify SCEVExpr for PHI to SCEV for operand if
operands are identical (#115945)
Helps SCEV analyze some special phi nodes, allowing the loop trip count to
be computed in cases like the following:
https://godbolt.org/z/xGs1d81TW
---
llvm/include/llvm/Analysis/ScalarEvolution.h | 4 +
llvm/lib/Analysis/ScalarEvolution.cpp | 39 ++++++++
.../trip-count-phi-increment.ll | 92 +++++++++++++++++++
3 files changed, 135 insertions(+)
create mode 100644 llvm/test/Analysis/ScalarEvolution/trip-count-phi-increment.ll
diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h
index b20c6a13cb6bd7..de74524c4b6fe4 100644
--- a/llvm/include/llvm/Analysis/ScalarEvolution.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolution.h
@@ -1780,6 +1780,10 @@ class ScalarEvolution {
/// V.
const SCEV *getOperandsToCreate(Value *V, SmallVectorImpl<Value *> &Ops);
+ /// Returns SCEV for the first operand of a phi if all phi operands have
+ /// identical opcodes and operands.
+ const SCEV *createNodeForPHIWithIdenticalOperands(PHINode *PN);
+
/// Provide the special handling we need to analyze PHI SCEVs.
const SCEV *createNodeForPHI(PHINode *PN);
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 882e938e69c0c2..cad10486cbf3fa 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -6019,6 +6019,42 @@ const SCEV *ScalarEvolution::createNodeFromSelectLikePHI(PHINode *PN) {
return nullptr;
}
+/// Returns SCEV for the first operand of a phi if all phi operands have
+/// identical opcodes and operands
+/// eg.
+/// a: %add = %a + %b
+/// br %c
+/// b: %add1 = %a + %b
+/// br %c
+/// c: %phi = phi [%add, a], [%add1, b]
+/// scev(%phi) => scev(%add)
+const SCEV *
+ScalarEvolution::createNodeForPHIWithIdenticalOperands(PHINode *PN) {
+ BinaryOperator *CommonInst = nullptr;
+ // Check if instructions are identical.
+ for (Value *Incoming : PN->incoming_values()) {
+ auto *IncomingInst = dyn_cast<BinaryOperator>(Incoming);
+ if (!IncomingInst)
+ return nullptr;
+ if (CommonInst) {
+ if (!CommonInst->isIdenticalToWhenDefined(IncomingInst))
+ return nullptr; // Not identical, give up
+ } else {
+ // Remember binary operator
+ CommonInst = IncomingInst;
+ }
+ }
+ if (!CommonInst)
+ return nullptr;
+
+ // Check if SCEV exprs for instructions are identical.
+ const SCEV *CommonSCEV = getSCEV(CommonInst);
+ bool SCEVExprsIdentical =
+ all_of(drop_begin(PN->incoming_values()),
+ [this, CommonSCEV](Value *V) { return CommonSCEV == getSCEV(V); });
+ return SCEVExprsIdentical ? CommonSCEV : nullptr;
+}
+
const SCEV *ScalarEvolution::createNodeForPHI(PHINode *PN) {
if (const SCEV *S = createAddRecFromPHI(PN))
return S;
@@ -6030,6 +6066,9 @@ const SCEV *ScalarEvolution::createNodeForPHI(PHINode *PN) {
/*UseInstrInfo=*/true, /*CanUseUndef=*/false}))
return getSCEV(V);
+ if (const SCEV *S = createNodeForPHIWithIdenticalOperands(PN))
+ return S;
+
if (const SCEV *S = createNodeFromSelectLikePHI(PN))
return S;
diff --git a/llvm/test/Analysis/ScalarEvolution/trip-count-phi-increment.ll b/llvm/test/Analysis/ScalarEvolution/trip-count-phi-increment.ll
new file mode 100644
index 00000000000000..db284b263d068c
--- /dev/null
+++ b/llvm/test/Analysis/ScalarEvolution/trip-count-phi-increment.ll
@@ -0,0 +1,92 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -disable-output "-passes=print<scalar-evolution>" 2>&1 | FileCheck %s
+define void @test1(ptr %x, ptr %y) {
+; CHECK-LABEL: 'test1'
+; CHECK-NEXT: Classifying expressions for: @test1
+; CHECK-NEXT: %v1.0 = phi i32 [ 0, %entry ], [ %k.0, %if.end ]
+; CHECK-NEXT: --> {0,+,1}<nuw><nsw><%for.cond> U: [0,7) S: [0,7) Exits: 6 LoopDispositions: { %for.cond: Computable }
+; CHECK-NEXT: %add = add nsw i32 %v1.0, 1
+; CHECK-NEXT: --> {1,+,1}<nuw><nsw><%for.cond> U: [1,8) S: [1,8) Exits: 7 LoopDispositions: { %for.cond: Computable }
+; CHECK-NEXT: %add6 = add nsw i32 %v1.0, 1
+; CHECK-NEXT: --> {1,+,1}<nuw><nsw><%for.cond> U: [1,8) S: [1,8) Exits: 7 LoopDispositions: { %for.cond: Computable }
+; CHECK-NEXT: %k.0 = phi i32 [ %add, %if.then ], [ %add6, %if.else ]
+; CHECK-NEXT: --> {1,+,1}<nuw><nsw><%for.cond> U: [1,8) S: [1,8) Exits: 7 LoopDispositions: { %for.cond: Computable }
+; CHECK-NEXT: Determining loop execution counts for: @test1
+; CHECK-NEXT: Loop %for.cond: backedge-taken count is i32 6
+; CHECK-NEXT: Loop %for.cond: constant max backedge-taken count is i32 6
+; CHECK-NEXT: Loop %for.cond: symbolic max backedge-taken count is i32 6
+; CHECK-NEXT: Loop %for.cond: Trip multiple is 7
+;
+entry:
+ br label %for.cond
+
+for.cond: ; preds = %6, %0
+ %v1.0 = phi i32 [ 0, %entry ], [ %k.0, %if.end ]
+ %cmp = icmp slt i32 %v1.0, 6
+ br i1 %cmp, label %for.body, label %exit
+
+for.body: ; preds = %1
+ %cmp3 = icmp slt i32 %v1.0, 2
+ br i1 %cmp3, label %if.then, label %if.else
+
+if.then: ; preds = %2
+ %add = add nsw i32 %v1.0, 1
+ br label %if.end
+
+if.else: ; preds = %2
+ %add6 = add nsw i32 %v1.0, 1
+ br label %if.end
+
+if.end: ; preds = %4, %3
+ %k.0 = phi i32 [ %add, %if.then ], [ %add6, %if.else ]
+ br label %for.cond
+
+exit: ; preds = %5
+ ret void
+}
+
+define void @test2(ptr %x, ptr %y) {
+; CHECK-LABEL: 'test2'
+; CHECK-NEXT: Classifying expressions for: @test2
+; CHECK-NEXT: %v1.0 = phi i32 [ 0, %entry ], [ %k.0, %if.end ]
+; CHECK-NEXT: --> {0,+,1}<nuw><nsw><%for.cond> U: [0,7) S: [0,7) Exits: 6 LoopDispositions: { %for.cond: Computable }
+; CHECK-NEXT: %add = add nuw i32 %v1.0, 1
+; CHECK-NEXT: --> {1,+,1}<nuw><nsw><%for.cond> U: [1,8) S: [1,8) Exits: 7 LoopDispositions: { %for.cond: Computable }
+; CHECK-NEXT: %add6 = add nsw i32 %v1.0, 1
+; CHECK-NEXT: --> {1,+,1}<nuw><nsw><%for.cond> U: [1,8) S: [1,8) Exits: 7 LoopDispositions: { %for.cond: Computable }
+; CHECK-NEXT: %k.0 = phi i32 [ %add, %if.then ], [ %add6, %if.else ]
+; CHECK-NEXT: --> {1,+,1}<nuw><nsw><%for.cond> U: [1,8) S: [1,8) Exits: 7 LoopDispositions: { %for.cond: Computable }
+; CHECK-NEXT: Determining loop execution counts for: @test2
+; CHECK-NEXT: Loop %for.cond: backedge-taken count is i32 6
+; CHECK-NEXT: Loop %for.cond: constant max backedge-taken count is i32 6
+; CHECK-NEXT: Loop %for.cond: symbolic max backedge-taken count is i32 6
+; CHECK-NEXT: Loop %for.cond: Trip multiple is 7
+;
+entry:
+ br label %for.cond
+
+for.cond: ; preds = %6, %0
+ %v1.0 = phi i32 [ 0, %entry ], [ %k.0, %if.end ]
+ %cmp = icmp slt i32 %v1.0, 6
+ br i1 %cmp, label %for.body, label %exit
+
+for.body: ; preds = %1
+ %cmp3 = icmp slt i32 %v1.0, 2
+ br i1 %cmp3, label %if.then, label %if.else
+
+if.then: ; preds = %2
+ %add = add nuw i32 %v1.0, 1
+ br label %if.end
+
+if.else: ; preds = %2
+ %add6 = add nsw i32 %v1.0, 1
+ br label %if.end
+
+if.end: ; preds = %4, %3
+ %k.0 = phi i32 [ %add, %if.then ], [ %add6, %if.else ]
+ br label %for.cond
+
+exit: ; preds = %5
+ ret void
+}
+
>From 3dbff90b16b5964b9fa468438ff40985be5c1ade Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Thu, 5 Dec 2024 15:38:09 +0000
Subject: [PATCH 03/18] [X86] matchPMADDWD/matchPMADDWD_2 - update to use
SDPatternMatch matching. NFCI.
Prep work for #118433
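For readers unfamiliar with SDPatternMatch, a minimal sketch of the matcher style the patch switches to (isSExtMulPair is a hypothetical helper, not part of the change): a single sd_match call replaces a chain of explicit getOpcode() checks and captures the matched operands in one step.

  #include "llvm/CodeGen/SDPatternMatch.h"
  #include "llvm/CodeGen/SelectionDAGNodes.h"

  // Matches (mul (sext A), (sext B)) rooted at N and binds A/B to the
  // pre-extension operands; returns false if the shape does not match.
  static bool isSExtMulPair(llvm::SDNode *N, llvm::SDValue &A, llvm::SDValue &B) {
    using namespace llvm::SDPatternMatch;
    return sd_match(N, m_Mul(m_SExt(m_Value(A)), m_SExt(m_Value(B))));
  }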
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 103 ++++++++++--------------
1 file changed, 41 insertions(+), 62 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index c18a4ac9acb1e4..f713f2ed209e1c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -56447,9 +56447,11 @@ static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
+static SDValue matchPMADDWD(SelectionDAG &DAG, SDNode *N,
const SDLoc &DL, EVT VT,
const X86Subtarget &Subtarget) {
+ using namespace SDPatternMatch;
+
// Example of pattern we try to detect:
// t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
//(add (build_vector (extract_elt t, 0),
@@ -56464,15 +56466,16 @@ static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
if (!Subtarget.hasSSE2())
return SDValue();
- if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
- Op1.getOpcode() != ISD::BUILD_VECTOR)
- return SDValue();
-
if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
VT.getVectorNumElements() < 4 ||
!isPowerOf2_32(VT.getVectorNumElements()))
return SDValue();
+ SDValue Op0, Op1;
+ if (!sd_match(N, m_Add(m_AllOf(m_Opc(ISD::BUILD_VECTOR), m_Value(Op0)),
+ m_AllOf(m_Opc(ISD::BUILD_VECTOR), m_Value(Op1)))))
+ return SDValue();
+
// Check if one of Op0,Op1 is of the form:
// (build_vector (extract_elt Mul, 0),
// (extract_elt Mul, 2),
@@ -56489,26 +56492,23 @@ static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
// TODO: Be more tolerant to undefs.
- if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
- Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
- Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
- Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
- return SDValue();
- auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
- auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
- auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
- auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
- if (!Const0L || !Const1L || !Const0H || !Const1H)
+ APInt Idx0L, Idx0H, Idx1L, Idx1H;
+ if (!sd_match(Op0L, m_BinOp(ISD::EXTRACT_VECTOR_ELT, m_Value(),
+ m_ConstInt(Idx0L))) ||
+ !sd_match(Op0H, m_BinOp(ISD::EXTRACT_VECTOR_ELT, m_Value(),
+ m_ConstInt(Idx0H))) ||
+ !sd_match(Op1L, m_BinOp(ISD::EXTRACT_VECTOR_ELT, m_Value(),
+ m_ConstInt(Idx1L))) ||
+ !sd_match(Op1H, m_BinOp(ISD::EXTRACT_VECTOR_ELT, m_Value(),
+ m_ConstInt(Idx1H))))
return SDValue();
- unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
- Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
// Commutativity of mul allows factors of a product to reorder.
- if (Idx0L > Idx1L)
+ if (Idx0L.getZExtValue() > Idx1L.getZExtValue())
std::swap(Idx0L, Idx1L);
- if (Idx0H > Idx1H)
+ if (Idx0H.getZExtValue() > Idx1H.getZExtValue())
std::swap(Idx0H, Idx1H);
// Commutativity of add allows pairs of factors to reorder.
- if (Idx0L > Idx0H) {
+ if (Idx0L.getZExtValue() > Idx0H.getZExtValue()) {
std::swap(Idx0L, Idx0H);
std::swap(Idx1L, Idx1H);
}
@@ -56555,13 +56555,12 @@ static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
// Attempt to turn this pattern into PMADDWD.
// (add (mul (sext (build_vector)), (sext (build_vector))),
// (mul (sext (build_vector)), (sext (build_vector)))
-static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
+static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDNode *N,
const SDLoc &DL, EVT VT,
const X86Subtarget &Subtarget) {
- if (!Subtarget.hasSSE2())
- return SDValue();
+ using namespace SDPatternMatch;
- if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
+ if (!Subtarget.hasSSE2())
return SDValue();
if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
@@ -56569,25 +56568,13 @@ static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
!isPowerOf2_32(VT.getVectorNumElements()))
return SDValue();
- SDValue N00 = N0.getOperand(0);
- SDValue N01 = N0.getOperand(1);
- SDValue N10 = N1.getOperand(0);
- SDValue N11 = N1.getOperand(1);
-
// All inputs need to be sign extends.
// TODO: Support ZERO_EXTEND from known positive?
- if (N00.getOpcode() != ISD::SIGN_EXTEND ||
- N01.getOpcode() != ISD::SIGN_EXTEND ||
- N10.getOpcode() != ISD::SIGN_EXTEND ||
- N11.getOpcode() != ISD::SIGN_EXTEND)
+ SDValue N00, N01, N10, N11;
+ if (!sd_match(N, m_Add(m_Mul(m_SExt(m_Value(N00)), m_SExt(m_Value(N01))),
+ m_Mul(m_SExt(m_Value(N10)), m_SExt(m_Value(N11))))))
return SDValue();
- // Peek through the extends.
- N00 = N00.getOperand(0);
- N01 = N01.getOperand(0);
- N10 = N10.getOperand(0);
- N11 = N11.getOperand(0);
-
// Must be extending from vXi16.
EVT InVT = N00.getValueType();
if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
@@ -56614,34 +56601,26 @@ static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
SDValue N10Elt = N10.getOperand(i);
SDValue N11Elt = N11.getOperand(i);
// TODO: Be more tolerant to undefs.
- if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
- N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
- N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
- N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
- return SDValue();
- auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
- auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
- auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
- auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
- if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
+ SDValue N00In, N01In, N10In, N11In;
+ APInt IdxN00, IdxN01, IdxN10, IdxN11;
+ if (!sd_match(N00Elt, m_BinOp(ISD::EXTRACT_VECTOR_ELT, m_Value(N00In),
+ m_ConstInt(IdxN00))) ||
+ !sd_match(N01Elt, m_BinOp(ISD::EXTRACT_VECTOR_ELT, m_Value(N01In),
+ m_ConstInt(IdxN01))) ||
+ !sd_match(N10Elt, m_BinOp(ISD::EXTRACT_VECTOR_ELT, m_Value(N10In),
+ m_ConstInt(IdxN10))) ||
+ !sd_match(N11Elt, m_BinOp(ISD::EXTRACT_VECTOR_ELT, m_Value(N11In),
+ m_ConstInt(IdxN11))))
return SDValue();
- unsigned IdxN00 = ConstN00Elt->getZExtValue();
- unsigned IdxN01 = ConstN01Elt->getZExtValue();
- unsigned IdxN10 = ConstN10Elt->getZExtValue();
- unsigned IdxN11 = ConstN11Elt->getZExtValue();
// Add is commutative so indices can be reordered.
- if (IdxN00 > IdxN10) {
+ if (IdxN00.getZExtValue() > IdxN10.getZExtValue()) {
std::swap(IdxN00, IdxN10);
std::swap(IdxN01, IdxN11);
}
// N0 indices be the even element. N1 indices must be the next odd element.
- if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
- IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
+ if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 || IdxN01 != 2 * i ||
+ IdxN11 != 2 * i + 1)
return SDValue();
- SDValue N00In = N00Elt.getOperand(0);
- SDValue N01In = N01Elt.getOperand(0);
- SDValue N10In = N10Elt.getOperand(0);
- SDValue N11In = N11Elt.getOperand(0);
// First time we find an input capture it.
if (!In0) {
@@ -56815,9 +56794,9 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
if (SDValue Select = pushAddIntoCmovOfConsts(N, DL, DAG, Subtarget))
return Select;
- if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, DL, VT, Subtarget))
+ if (SDValue MAdd = matchPMADDWD(DAG, N, DL, VT, Subtarget))
return MAdd;
- if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, DL, VT, Subtarget))
+ if (SDValue MAdd = matchPMADDWD_2(DAG, N, DL, VT, Subtarget))
return MAdd;
if (SDValue MAdd = combineAddOfPMADDWD(DAG, Op0, Op1, DL, VT))
return MAdd;
>From 1126bef609e7afa77105308406d74d4e459ee0a5 Mon Sep 17 00:00:00 2001
From: James Chesterman <James.Chesterman at arm.com>
Date: Fri, 6 Dec 2024 10:56:57 +0000
Subject: [PATCH 04/18] [AArch64][SVE] Only generate wide adds when SVE2 or
StreamingSVE is available (#118838)
---
.../Target/AArch64/AArch64ISelLowering.cpp | 2 +-
.../AArch64/sve-partial-reduce-wide-add.ll | 111 +++++++++++++-----
2 files changed, 81 insertions(+), 32 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index e61dedb2477560..d1354ccf376609 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -21817,7 +21817,7 @@ SDValue tryLowerPartialReductionToWideAdd(SDNode *N,
Intrinsic::experimental_vector_partial_reduce_add &&
"Expected a partial reduction node");
- if (!Subtarget->isSVEorStreamingSVEAvailable())
+ if (!Subtarget->hasSVE2() && !Subtarget->isStreamingSVEAvailable())
return SDValue();
SDLoc DL(N);
diff --git a/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll b/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll
index 1d05649964670d..b4b946c68566ed 100644
--- a/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll
+++ b/llvm/test/CodeGen/AArch64/sve-partial-reduce-wide-add.ll
@@ -1,12 +1,21 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=aarch64 -mattr=+sve2 %s -o - | FileCheck %s
+; RUN: llc -mtriple=aarch64 -mattr=+sve2 %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SVE2
+; RUN: llc -mtriple=aarch64 -mattr=+sve %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SVE
define <vscale x 2 x i64> @signed_wide_add_nxv4i32(<vscale x 2 x i64> %acc, <vscale x 4 x i32> %input){
-; CHECK-LABEL: signed_wide_add_nxv4i32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: saddwb z0.d, z0.d, z1.s
-; CHECK-NEXT: saddwt z0.d, z0.d, z1.s
-; CHECK-NEXT: ret
+; CHECK-SVE2-LABEL: signed_wide_add_nxv4i32:
+; CHECK-SVE2: // %bb.0: // %entry
+; CHECK-SVE2-NEXT: saddwb z0.d, z0.d, z1.s
+; CHECK-SVE2-NEXT: saddwt z0.d, z0.d, z1.s
+; CHECK-SVE2-NEXT: ret
+;
+; CHECK-SVE-LABEL: signed_wide_add_nxv4i32:
+; CHECK-SVE: // %bb.0: // %entry
+; CHECK-SVE-NEXT: sunpklo z2.d, z1.s
+; CHECK-SVE-NEXT: sunpkhi z1.d, z1.s
+; CHECK-SVE-NEXT: add z0.d, z0.d, z2.d
+; CHECK-SVE-NEXT: add z0.d, z1.d, z0.d
+; CHECK-SVE-NEXT: ret
entry:
%input.wide = sext <vscale x 4 x i32> %input to <vscale x 4 x i64>
%partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv4i64(<vscale x 2 x i64> %acc, <vscale x 4 x i64> %input.wide)
@@ -14,11 +23,19 @@ entry:
}
define <vscale x 2 x i64> @unsigned_wide_add_nxv4i32(<vscale x 2 x i64> %acc, <vscale x 4 x i32> %input){
-; CHECK-LABEL: unsigned_wide_add_nxv4i32:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: uaddwb z0.d, z0.d, z1.s
-; CHECK-NEXT: uaddwt z0.d, z0.d, z1.s
-; CHECK-NEXT: ret
+; CHECK-SVE2-LABEL: unsigned_wide_add_nxv4i32:
+; CHECK-SVE2: // %bb.0: // %entry
+; CHECK-SVE2-NEXT: uaddwb z0.d, z0.d, z1.s
+; CHECK-SVE2-NEXT: uaddwt z0.d, z0.d, z1.s
+; CHECK-SVE2-NEXT: ret
+;
+; CHECK-SVE-LABEL: unsigned_wide_add_nxv4i32:
+; CHECK-SVE: // %bb.0: // %entry
+; CHECK-SVE-NEXT: uunpklo z2.d, z1.s
+; CHECK-SVE-NEXT: uunpkhi z1.d, z1.s
+; CHECK-SVE-NEXT: add z0.d, z0.d, z2.d
+; CHECK-SVE-NEXT: add z0.d, z1.d, z0.d
+; CHECK-SVE-NEXT: ret
entry:
%input.wide = zext <vscale x 4 x i32> %input to <vscale x 4 x i64>
%partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv4i64(<vscale x 2 x i64> %acc, <vscale x 4 x i64> %input.wide)
@@ -26,11 +43,19 @@ entry:
}
define <vscale x 4 x i32> @signed_wide_add_nxv8i16(<vscale x 4 x i32> %acc, <vscale x 8 x i16> %input){
-; CHECK-LABEL: signed_wide_add_nxv8i16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: saddwb z0.s, z0.s, z1.h
-; CHECK-NEXT: saddwt z0.s, z0.s, z1.h
-; CHECK-NEXT: ret
+; CHECK-SVE2-LABEL: signed_wide_add_nxv8i16:
+; CHECK-SVE2: // %bb.0: // %entry
+; CHECK-SVE2-NEXT: saddwb z0.s, z0.s, z1.h
+; CHECK-SVE2-NEXT: saddwt z0.s, z0.s, z1.h
+; CHECK-SVE2-NEXT: ret
+;
+; CHECK-SVE-LABEL: signed_wide_add_nxv8i16:
+; CHECK-SVE: // %bb.0: // %entry
+; CHECK-SVE-NEXT: sunpklo z2.s, z1.h
+; CHECK-SVE-NEXT: sunpkhi z1.s, z1.h
+; CHECK-SVE-NEXT: add z0.s, z0.s, z2.s
+; CHECK-SVE-NEXT: add z0.s, z1.s, z0.s
+; CHECK-SVE-NEXT: ret
entry:
%input.wide = sext <vscale x 8 x i16> %input to <vscale x 8 x i32>
%partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv8i32(<vscale x 4 x i32> %acc, <vscale x 8 x i32> %input.wide)
@@ -38,11 +63,19 @@ entry:
}
define <vscale x 4 x i32> @unsigned_wide_add_nxv8i16(<vscale x 4 x i32> %acc, <vscale x 8 x i16> %input){
-; CHECK-LABEL: unsigned_wide_add_nxv8i16:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: uaddwb z0.s, z0.s, z1.h
-; CHECK-NEXT: uaddwt z0.s, z0.s, z1.h
-; CHECK-NEXT: ret
+; CHECK-SVE2-LABEL: unsigned_wide_add_nxv8i16:
+; CHECK-SVE2: // %bb.0: // %entry
+; CHECK-SVE2-NEXT: uaddwb z0.s, z0.s, z1.h
+; CHECK-SVE2-NEXT: uaddwt z0.s, z0.s, z1.h
+; CHECK-SVE2-NEXT: ret
+;
+; CHECK-SVE-LABEL: unsigned_wide_add_nxv8i16:
+; CHECK-SVE: // %bb.0: // %entry
+; CHECK-SVE-NEXT: uunpklo z2.s, z1.h
+; CHECK-SVE-NEXT: uunpkhi z1.s, z1.h
+; CHECK-SVE-NEXT: add z0.s, z0.s, z2.s
+; CHECK-SVE-NEXT: add z0.s, z1.s, z0.s
+; CHECK-SVE-NEXT: ret
entry:
%input.wide = zext <vscale x 8 x i16> %input to <vscale x 8 x i32>
%partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv8i32(<vscale x 4 x i32> %acc, <vscale x 8 x i32> %input.wide)
@@ -50,11 +83,19 @@ entry:
}
define <vscale x 8 x i16> @signed_wide_add_nxv16i8(<vscale x 8 x i16> %acc, <vscale x 16 x i8> %input){
-; CHECK-LABEL: signed_wide_add_nxv16i8:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: saddwb z0.h, z0.h, z1.b
-; CHECK-NEXT: saddwt z0.h, z0.h, z1.b
-; CHECK-NEXT: ret
+; CHECK-SVE2-LABEL: signed_wide_add_nxv16i8:
+; CHECK-SVE2: // %bb.0: // %entry
+; CHECK-SVE2-NEXT: saddwb z0.h, z0.h, z1.b
+; CHECK-SVE2-NEXT: saddwt z0.h, z0.h, z1.b
+; CHECK-SVE2-NEXT: ret
+;
+; CHECK-SVE-LABEL: signed_wide_add_nxv16i8:
+; CHECK-SVE: // %bb.0: // %entry
+; CHECK-SVE-NEXT: sunpklo z2.h, z1.b
+; CHECK-SVE-NEXT: sunpkhi z1.h, z1.b
+; CHECK-SVE-NEXT: add z0.h, z0.h, z2.h
+; CHECK-SVE-NEXT: add z0.h, z1.h, z0.h
+; CHECK-SVE-NEXT: ret
entry:
%input.wide = sext <vscale x 16 x i8> %input to <vscale x 16 x i16>
%partial.reduce = tail call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i16(<vscale x 8 x i16> %acc, <vscale x 16 x i16> %input.wide)
@@ -62,11 +103,19 @@ entry:
}
define <vscale x 8 x i16> @unsigned_wide_add_nxv16i8(<vscale x 8 x i16> %acc, <vscale x 16 x i8> %input){
-; CHECK-LABEL: unsigned_wide_add_nxv16i8:
-; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: uaddwb z0.h, z0.h, z1.b
-; CHECK-NEXT: uaddwt z0.h, z0.h, z1.b
-; CHECK-NEXT: ret
+; CHECK-SVE2-LABEL: unsigned_wide_add_nxv16i8:
+; CHECK-SVE2: // %bb.0: // %entry
+; CHECK-SVE2-NEXT: uaddwb z0.h, z0.h, z1.b
+; CHECK-SVE2-NEXT: uaddwt z0.h, z0.h, z1.b
+; CHECK-SVE2-NEXT: ret
+;
+; CHECK-SVE-LABEL: unsigned_wide_add_nxv16i8:
+; CHECK-SVE: // %bb.0: // %entry
+; CHECK-SVE-NEXT: uunpklo z2.h, z1.b
+; CHECK-SVE-NEXT: uunpkhi z1.h, z1.b
+; CHECK-SVE-NEXT: add z0.h, z0.h, z2.h
+; CHECK-SVE-NEXT: add z0.h, z1.h, z0.h
+; CHECK-SVE-NEXT: ret
entry:
%input.wide = zext <vscale x 16 x i8> %input to <vscale x 16 x i16>
%partial.reduce = tail call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i16(<vscale x 8 x i16> %acc, <vscale x 16 x i16> %input.wide)
>From 2a4c74cc8451f0ba34baaee203afb941a498f6b3 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Fri, 6 Dec 2024 11:08:55 +0000
Subject: [PATCH 05/18] [AArch64] Update the scheduling model for
Cortex-X1/2/3/4 (#118826)
These Neoverse-V scheduling models more closely match the Cortex-X
series CPUs with 4 vector pipelines, even if they do not match exactly.
---
llvm/lib/Target/AArch64/AArch64Processors.td | 10 ++--
.../AArch64/Cortex/X1-neon-instructions.s | 45 ++++++++++++++++++
.../AArch64/Cortex/X2-sve-instructions.s | 45 ++++++++++--------
.../AArch64/Cortex/X3-sve-instructions.s | 47 +++++++++++++++++++
.../AArch64/Cortex/X4-sve-instructions.s | 47 +++++++++++++++++++
5 files changed, 170 insertions(+), 24 deletions(-)
create mode 100644 llvm/test/tools/llvm-mca/AArch64/Cortex/X1-neon-instructions.s
create mode 100644 llvm/test/tools/llvm-mca/AArch64/Cortex/X3-sve-instructions.s
create mode 100644 llvm/test/tools/llvm-mca/AArch64/Cortex/X4-sve-instructions.s
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index 6886df5392565d..af9554085cacde 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -1113,15 +1113,15 @@ def : ProcessorModel<"cortex-r82", CortexA55Model, ProcessorFeatures.R82,
[TuneR82]>;
def : ProcessorModel<"cortex-r82ae", CortexA55Model, ProcessorFeatures.R82AE,
[TuneR82AE]>;
-def : ProcessorModel<"cortex-x1", CortexA57Model, ProcessorFeatures.X1,
+def : ProcessorModel<"cortex-x1", NeoverseV1Model, ProcessorFeatures.X1,
[TuneX1]>;
-def : ProcessorModel<"cortex-x1c", CortexA57Model, ProcessorFeatures.X1C,
+def : ProcessorModel<"cortex-x1c", NeoverseV1Model, ProcessorFeatures.X1C,
[TuneX1]>;
-def : ProcessorModel<"cortex-x2", NeoverseN2Model, ProcessorFeatures.X2,
+def : ProcessorModel<"cortex-x2", NeoverseV2Model, ProcessorFeatures.X2,
[TuneX2]>;
-def : ProcessorModel<"cortex-x3", NeoverseN2Model, ProcessorFeatures.X3,
+def : ProcessorModel<"cortex-x3", NeoverseV2Model, ProcessorFeatures.X3,
[TuneX3]>;
-def : ProcessorModel<"cortex-x4", NeoverseN2Model, ProcessorFeatures.X4,
+def : ProcessorModel<"cortex-x4", NeoverseV2Model, ProcessorFeatures.X4,
[TuneX4]>;
def : ProcessorModel<"cortex-x925", NeoverseV2Model, ProcessorFeatures.X925,
[TuneX925]>;
diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/X1-neon-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/X1-neon-instructions.s
new file mode 100644
index 00000000000000..dc1bb486aeef7d
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/X1-neon-instructions.s
@@ -0,0 +1,45 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-x1 -instruction-tables < %s | FileCheck %s
+
+# Check the Neoverse V1 model is used.
+
+add v0.16b, v1.16b, v31.16b
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 2 0.25 add v0.16b, v1.16b, v31.16b
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V1UnitB
+# CHECK-NEXT: [0.1] - V1UnitB
+# CHECK-NEXT: [1.0] - V1UnitD
+# CHECK-NEXT: [1.1] - V1UnitD
+# CHECK-NEXT: [2.0] - V1UnitFlg
+# CHECK-NEXT: [2.1] - V1UnitFlg
+# CHECK-NEXT: [2.2] - V1UnitFlg
+# CHECK-NEXT: [3] - V1UnitL2
+# CHECK-NEXT: [4.0] - V1UnitL01
+# CHECK-NEXT: [4.1] - V1UnitL01
+# CHECK-NEXT: [5] - V1UnitM0
+# CHECK-NEXT: [6] - V1UnitM1
+# CHECK-NEXT: [7.0] - V1UnitS
+# CHECK-NEXT: [7.1] - V1UnitS
+# CHECK-NEXT: [8] - V1UnitV0
+# CHECK-NEXT: [9] - V1UnitV1
+# CHECK-NEXT: [10] - V1UnitV2
+# CHECK-NEXT: [11] - V1UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2.0] [2.1] [2.2] [3] [4.0] [4.1] [5] [6] [7.0] [7.1] [8] [9] [10] [11]
+# CHECK-NEXT: - - - - - - - - - - - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2.0] [2.1] [2.2] [3] [4.0] [4.1] [5] [6] [7.0] [7.1] [8] [9] [10] [11] Instructions:
+# CHECK-NEXT: - - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 add v0.16b, v1.16b, v31.16b
diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/X2-sve-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/X2-sve-instructions.s
index 2912ea35f1ee88..6497860ecfbacb 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Cortex/X2-sve-instructions.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/X2-sve-instructions.s
@@ -1,7 +1,7 @@
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-x2 -instruction-tables < %s | FileCheck %s
-# Check the Neoverse N2 model is used.
+# Check the Neoverse V2 model is used.
addhnb z0.b, z1.h, z31.h
@@ -14,27 +14,34 @@ addhnb z0.b, z1.h, z31.h
# CHECK-NEXT: [6]: HasSideEffects (U)
# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
-# CHECK-NEXT: 1 2 0.50 addhnb z0.b, z1.h, z31.h
+# CHECK-NEXT: 1 2 0.25 addhnb z0.b, z1.h, z31.h
# CHECK: Resources:
-# CHECK-NEXT: [0.0] - N2UnitB
-# CHECK-NEXT: [0.1] - N2UnitB
-# CHECK-NEXT: [1.0] - N2UnitD
-# CHECK-NEXT: [1.1] - N2UnitD
-# CHECK-NEXT: [2] - N2UnitL2
-# CHECK-NEXT: [3.0] - N2UnitL01
-# CHECK-NEXT: [3.1] - N2UnitL01
-# CHECK-NEXT: [4] - N2UnitM0
-# CHECK-NEXT: [5] - N2UnitM1
-# CHECK-NEXT: [6.0] - N2UnitS
-# CHECK-NEXT: [6.1] - N2UnitS
-# CHECK-NEXT: [7] - N2UnitV0
-# CHECK-NEXT: [8] - N2UnitV1
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2.0] - V2UnitFlg
+# CHECK-NEXT: [2.1] - V2UnitFlg
+# CHECK-NEXT: [2.2] - V2UnitFlg
+# CHECK-NEXT: [3] - V2UnitL2
+# CHECK-NEXT: [4.0] - V2UnitL01
+# CHECK-NEXT: [4.1] - V2UnitL01
+# CHECK-NEXT: [5] - V2UnitM0
+# CHECK-NEXT: [6] - V2UnitM1
+# CHECK-NEXT: [7] - V2UnitS0
+# CHECK-NEXT: [8] - V2UnitS1
+# CHECK-NEXT: [9] - V2UnitS2
+# CHECK-NEXT: [10] - V2UnitS3
+# CHECK-NEXT: [11] - V2UnitV0
+# CHECK-NEXT: [12] - V2UnitV1
+# CHECK-NEXT: [13] - V2UnitV2
+# CHECK-NEXT: [14] - V2UnitV3
# CHECK: Resource pressure per iteration:
-# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8]
-# CHECK-NEXT: - - - - - - - - - - - 0.50 0.50
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2.0] [2.1] [2.2] [3] [4.0] [4.1] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14]
+# CHECK-NEXT: - - - - - - - - - - - - - - - - 0.25 0.25 0.25 0.25
# CHECK: Resource pressure by instruction:
-# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] Instructions:
-# CHECK-NEXT: - - - - - - - - - - - 0.50 0.50 addhnb z0.b, z1.h, z31.h
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2.0] [2.1] [2.2] [3] [4.0] [4.1] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14] Instructions:
+# CHECK-NEXT: - - - - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 addhnb z0.b, z1.h, z31.h
diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/X3-sve-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/X3-sve-instructions.s
new file mode 100644
index 00000000000000..042e621f9a03d6
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/X3-sve-instructions.s
@@ -0,0 +1,47 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-x3 -instruction-tables < %s | FileCheck %s
+
+# Check the Neoverse V2 model is used.
+
+addhnb z0.b, z1.h, z31.h
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 2 0.25 addhnb z0.b, z1.h, z31.h
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2.0] - V2UnitFlg
+# CHECK-NEXT: [2.1] - V2UnitFlg
+# CHECK-NEXT: [2.2] - V2UnitFlg
+# CHECK-NEXT: [3] - V2UnitL2
+# CHECK-NEXT: [4.0] - V2UnitL01
+# CHECK-NEXT: [4.1] - V2UnitL01
+# CHECK-NEXT: [5] - V2UnitM0
+# CHECK-NEXT: [6] - V2UnitM1
+# CHECK-NEXT: [7] - V2UnitS0
+# CHECK-NEXT: [8] - V2UnitS1
+# CHECK-NEXT: [9] - V2UnitS2
+# CHECK-NEXT: [10] - V2UnitS3
+# CHECK-NEXT: [11] - V2UnitV0
+# CHECK-NEXT: [12] - V2UnitV1
+# CHECK-NEXT: [13] - V2UnitV2
+# CHECK-NEXT: [14] - V2UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2.0] [2.1] [2.2] [3] [4.0] [4.1] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14]
+# CHECK-NEXT: - - - - - - - - - - - - - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2.0] [2.1] [2.2] [3] [4.0] [4.1] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14] Instructions:
+# CHECK-NEXT: - - - - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 addhnb z0.b, z1.h, z31.h
diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/X4-sve-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/X4-sve-instructions.s
new file mode 100644
index 00000000000000..19fba62ea30c6b
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/X4-sve-instructions.s
@@ -0,0 +1,47 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-x4 -instruction-tables < %s | FileCheck %s
+
+# Check the Neoverse V2 model is used.
+
+addhnb z0.b, z1.h, z31.h
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 2 0.25 addhnb z0.b, z1.h, z31.h
+
+# CHECK: Resources:
+# CHECK-NEXT: [0.0] - V2UnitB
+# CHECK-NEXT: [0.1] - V2UnitB
+# CHECK-NEXT: [1.0] - V2UnitD
+# CHECK-NEXT: [1.1] - V2UnitD
+# CHECK-NEXT: [2.0] - V2UnitFlg
+# CHECK-NEXT: [2.1] - V2UnitFlg
+# CHECK-NEXT: [2.2] - V2UnitFlg
+# CHECK-NEXT: [3] - V2UnitL2
+# CHECK-NEXT: [4.0] - V2UnitL01
+# CHECK-NEXT: [4.1] - V2UnitL01
+# CHECK-NEXT: [5] - V2UnitM0
+# CHECK-NEXT: [6] - V2UnitM1
+# CHECK-NEXT: [7] - V2UnitS0
+# CHECK-NEXT: [8] - V2UnitS1
+# CHECK-NEXT: [9] - V2UnitS2
+# CHECK-NEXT: [10] - V2UnitS3
+# CHECK-NEXT: [11] - V2UnitV0
+# CHECK-NEXT: [12] - V2UnitV1
+# CHECK-NEXT: [13] - V2UnitV2
+# CHECK-NEXT: [14] - V2UnitV3
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2.0] [2.1] [2.2] [3] [4.0] [4.1] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14]
+# CHECK-NEXT: - - - - - - - - - - - - - - - - 0.25 0.25 0.25 0.25
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2.0] [2.1] [2.2] [3] [4.0] [4.1] [5] [6] [7] [8] [9] [10] [11] [12] [13] [14] Instructions:
+# CHECK-NEXT: - - - - - - - - - - - - - - - - 0.25 0.25 0.25 0.25 addhnb z0.b, z1.h, z31.h
>From 9a24f2198ec02960c9e9afedace96ba6afa9c5b1 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov at redhat.com>
Date: Thu, 5 Dec 2024 18:01:44 +0100
Subject: [PATCH 06/18] [MergeFuncs] Handle ConstantRangeList attributes
Support comparison of ConstantRangeList attributes in
FunctionComparator.
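As a rough standalone sketch of the ordering this adds (illustrative only; cmpRangeListsSketch is not part of the patch and assumes all ranges share a bit width), constant-range lists compare first by length, then element-wise, each range by its lower and then upper bound:

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/STLExtras.h"
  #include "llvm/IR/ConstantRange.h"
  using namespace llvm;

  // Stand-in for FunctionComparator::cmpAPInts; assumes equal bit widths.
  static int cmpAPIntsSketch(const APInt &L, const APInt &R) {
    if (L.ult(R))
      return -1;
    return R.ult(L) ? 1 : 0;
  }

  static int cmpRangeListsSketch(ArrayRef<ConstantRange> L,
                                 ArrayRef<ConstantRange> R) {
    // Shorter lists order before longer ones.
    if (L.size() != R.size())
      return L.size() < R.size() ? -1 : 1;
    // Then compare element-wise: lower bound first, then upper bound.
    for (const auto &[LR, RR] : zip(L, R)) {
      if (int Res = cmpAPIntsSketch(LR.getLower(), RR.getLower()))
        return Res;
      if (int Res = cmpAPIntsSketch(LR.getUpper(), RR.getUpper()))
        return Res;
    }
    return 0;
  }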
---
.../Transforms/Utils/FunctionComparator.h | 1 +
.../Transforms/Utils/FunctionComparator.cpp | 29 +++++++---
.../Transforms/MergeFunc/initializes-attr.ll | 53 +++++++++++++++++++
3 files changed, 76 insertions(+), 7 deletions(-)
create mode 100644 llvm/test/Transforms/MergeFunc/initializes-attr.ll
diff --git a/llvm/include/llvm/Transforms/Utils/FunctionComparator.h b/llvm/include/llvm/Transforms/Utils/FunctionComparator.h
index c28f868039a1f7..19c5f7449f23ee 100644
--- a/llvm/include/llvm/Transforms/Utils/FunctionComparator.h
+++ b/llvm/include/llvm/Transforms/Utils/FunctionComparator.h
@@ -317,6 +317,7 @@ class FunctionComparator {
int cmpNumbers(uint64_t L, uint64_t R) const;
int cmpAligns(Align L, Align R) const;
int cmpAPInts(const APInt &L, const APInt &R) const;
+ int cmpConstantRanges(const ConstantRange &L, const ConstantRange &R) const;
int cmpAPFloats(const APFloat &L, const APFloat &R) const;
int cmpMem(StringRef L, StringRef R) const;
diff --git a/llvm/lib/Transforms/Utils/FunctionComparator.cpp b/llvm/lib/Transforms/Utils/FunctionComparator.cpp
index 760341a29d8c8a..6d4026e8209de2 100644
--- a/llvm/lib/Transforms/Utils/FunctionComparator.cpp
+++ b/llvm/lib/Transforms/Utils/FunctionComparator.cpp
@@ -83,6 +83,13 @@ int FunctionComparator::cmpAPInts(const APInt &L, const APInt &R) const {
return 0;
}
+int FunctionComparator::cmpConstantRanges(const ConstantRange &L,
+ const ConstantRange &R) const {
+ if (int Res = cmpAPInts(L.getLower(), R.getLower()))
+ return Res;
+ return cmpAPInts(L.getUpper(), R.getUpper());
+}
+
int FunctionComparator::cmpAPFloats(const APFloat &L, const APFloat &R) const {
// Floats are ordered first by semantics (i.e. float, double, half, etc.),
// then by value interpreted as a bitstring (aka APInt).
@@ -147,12 +154,22 @@ int FunctionComparator::cmpAttrs(const AttributeList L,
if (LA.getKindAsEnum() != RA.getKindAsEnum())
return cmpNumbers(LA.getKindAsEnum(), RA.getKindAsEnum());
- const ConstantRange &LCR = LA.getRange();
- const ConstantRange &RCR = RA.getRange();
- if (int Res = cmpAPInts(LCR.getLower(), RCR.getLower()))
+ if (int Res = cmpConstantRanges(LA.getRange(), RA.getRange()))
return Res;
- if (int Res = cmpAPInts(LCR.getUpper(), RCR.getUpper()))
+ continue;
+ } else if (LA.isConstantRangeListAttribute() &&
+ RA.isConstantRangeListAttribute()) {
+ if (LA.getKindAsEnum() != RA.getKindAsEnum())
+ return cmpNumbers(LA.getKindAsEnum(), RA.getKindAsEnum());
+
+ ArrayRef<ConstantRange> CRL = LA.getValueAsConstantRangeList();
+ ArrayRef<ConstantRange> CRR = RA.getValueAsConstantRangeList();
+ if (int Res = cmpNumbers(CRL.size(), CRR.size()))
return Res;
+
+ for (const auto &[L, R] : zip(CRL, CRR))
+ if (int Res = cmpConstantRanges(L, R))
+ return Res;
continue;
}
if (LA < RA)
@@ -441,9 +458,7 @@ int FunctionComparator::cmpConstants(const Constant *L,
if (InRangeL) {
if (!InRangeR)
return 1;
- if (int Res = cmpAPInts(InRangeL->getLower(), InRangeR->getLower()))
- return Res;
- if (int Res = cmpAPInts(InRangeL->getUpper(), InRangeR->getUpper()))
+ if (int Res = cmpConstantRanges(*InRangeL, *InRangeR))
return Res;
} else if (InRangeR) {
return -1;
diff --git a/llvm/test/Transforms/MergeFunc/initializes-attr.ll b/llvm/test/Transforms/MergeFunc/initializes-attr.ll
new file mode 100644
index 00000000000000..0bd4fe2e474dd2
--- /dev/null
+++ b/llvm/test/Transforms/MergeFunc/initializes-attr.ll
@@ -0,0 +1,53 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=mergefunc < %s | FileCheck %s
+
+define internal void @test1(ptr initializes((0, 1)) %p) {
+; CHECK-LABEL: define internal void @test1(
+; CHECK-SAME: ptr initializes((0, 1)) [[P:%.*]]) {
+; CHECK-NEXT: store i16 0, ptr [[P]], align 2
+; CHECK-NEXT: ret void
+;
+ store i16 0, ptr %p
+ ret void
+}
+
+define internal void @test2(ptr initializes((0, 1)) %p) {
+ store i16 0, ptr %p
+ ret void
+}
+
+define internal void @test3(ptr initializes((0, 2)) %p) {
+; CHECK-LABEL: define internal void @test3(
+; CHECK-SAME: ptr initializes((0, 2)) [[P:%.*]]) {
+; CHECK-NEXT: store i16 0, ptr [[P]], align 2
+; CHECK-NEXT: ret void
+;
+ store i16 0, ptr %p
+ ret void
+}
+
+define internal void @test4(ptr initializes((0, 1), (2, 3)) %p) {
+; CHECK-LABEL: define internal void @test4(
+; CHECK-SAME: ptr initializes((0, 1), (2, 3)) [[P:%.*]]) {
+; CHECK-NEXT: store i16 0, ptr [[P]], align 2
+; CHECK-NEXT: ret void
+;
+ store i16 0, ptr %p
+ ret void
+}
+
+define void @do_calls(ptr %p) {
+; CHECK-LABEL: define void @do_calls(
+; CHECK-SAME: ptr [[P:%.*]]) {
+; CHECK-NEXT: call void @test1(ptr [[P]])
+; CHECK-NEXT: call void @test1(ptr [[P]])
+; CHECK-NEXT: call void @test3(ptr [[P]])
+; CHECK-NEXT: call void @test4(ptr [[P]])
+; CHECK-NEXT: ret void
+;
+ call void @test1(ptr %p)
+ call void @test2(ptr %p)
+ call void @test3(ptr %p)
+ call void @test4(ptr %p)
+ ret void
+}
>From f081ffe70165dc3a585279a74494497a48a65b15 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 6 Dec 2024 11:33:29 +0000
Subject: [PATCH 07/18] [LV] Simplify & clarify bypass handling for IV resume
values (NFC)
Split off the NFC refactoring part from
https://github.com/llvm/llvm-project/pull/110577. This simplifies and
clarifies induction resume value creation for bypass blocks.
---
.../Transforms/Vectorize/LoopVectorize.cpp | 87 ++++++++++---------
1 file changed, 46 insertions(+), 41 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3c7c044a042719..856c096319c085 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -517,13 +517,15 @@ class InnerLoopVectorizer {
/// iteration count in the scalar epilogue, from where the vectorized loop
/// left off. \p Step is the SCEV-expanded induction step to use. In cases
/// where the loop skeleton is more complicated (i.e., epilogue vectorization)
- /// and the resume values can come from an additional bypass block, the \p
- /// AdditionalBypass pair provides information about the bypass block and the
- /// end value on the edge from bypass to this loop.
- PHINode *createInductionResumeValue(
- PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step,
- ArrayRef<BasicBlock *> BypassBlocks,
- std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
+ /// and the resume values can come from an additional bypass block,
+ /// \p MainVectorTripCount provides the trip count of the main vector loop,
+ /// used to compute the resume value reaching the scalar loop preheader
+ /// directly from this additional bypass block.
+ PHINode *createInductionResumeValue(PHINode *OrigPhi,
+ const InductionDescriptor &ID,
+ Value *Step,
+ ArrayRef<BasicBlock *> BypassBlocks,
+ Value *MainVectorTripCount = nullptr);
/// Returns the original loop trip count.
Value *getTripCount() const { return TripCount; }
@@ -533,6 +535,14 @@ class InnerLoopVectorizer {
/// count of the original loop for both main loop and epilogue vectorization.
void setTripCount(Value *TC) { TripCount = TC; }
+ /// Return the additional bypass block which targets the scalar loop by
+ /// skipping the epilogue loop after completing the main loop.
+ BasicBlock *getAdditionalBypassBlock() const {
+ assert(AdditionalBypassBlock &&
+ "Trying to access AdditionalBypassBlock but it has not been set");
+ return AdditionalBypassBlock;
+ }
+
protected:
friend class LoopVectorizationPlanner;
@@ -568,13 +578,11 @@ class InnerLoopVectorizer {
/// Create new phi nodes for the induction variables to resume iteration count
/// in the scalar epilogue, from where the vectorized loop left off.
- /// In cases where the loop skeleton is more complicated (eg. epilogue
- /// vectorization) and the resume values can come from an additional bypass
- /// block, the \p AdditionalBypass pair provides information about the bypass
- /// block and the end value on the edge from bypass to this loop.
- void createInductionResumeValues(
- const SCEV2ValueTy &ExpandedSCEVs,
- std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
+ /// In cases where the loop skeleton is more complicated (i.e. epilogue
+ /// vectorization), \p MainVectorTripCount provides the trip count of the main
+ /// loop, used to compute these resume values.
+ void createInductionResumeValues(const SCEV2ValueTy &ExpandedSCEVs,
+ Value *MainVectorTripCount = nullptr);
/// Allow subclasses to override and print debug traces before/after vplan
/// execution, when trace information is requested.
@@ -664,6 +672,11 @@ class InnerLoopVectorizer {
/// for cleaning the checks, if vectorization turns out unprofitable.
GeneratedRTChecks &RTChecks;
+ /// The additional bypass block which conditionally skips over the epilogue
+ /// loop after executing the main loop. Needed to resume inductions and
+ /// reductions during epilogue vectorization.
+ BasicBlock *AdditionalBypassBlock = nullptr;
+
VPlan &Plan;
};
@@ -2582,18 +2595,16 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
PHINode *InnerLoopVectorizer::createInductionResumeValue(
PHINode *OrigPhi, const InductionDescriptor &II, Value *Step,
- ArrayRef<BasicBlock *> BypassBlocks,
- std::pair<BasicBlock *, Value *> AdditionalBypass) {
+ ArrayRef<BasicBlock *> BypassBlocks, Value *MainVectorTripCount) {
Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
assert(VectorTripCount && "Expected valid arguments");
Instruction *OldInduction = Legal->getPrimaryInduction();
- Value *EndValue = nullptr;
- Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
- if (OrigPhi == OldInduction) {
- // We know what the end value is.
- EndValue = VectorTripCount;
- } else {
+ // For the primary induction the end values are known.
+ Value *EndValue = VectorTripCount;
+ Value *EndValueFromAdditionalBypass = MainVectorTripCount;
+ // Otherwise compute them accordingly.
+ if (OrigPhi != OldInduction) {
IRBuilder<> B(LoopVectorPreHeader->getTerminator());
// Fast-math-flags propagate from the original induction instruction.
@@ -2605,12 +2616,12 @@ PHINode *InnerLoopVectorizer::createInductionResumeValue(
EndValue->setName("ind.end");
// Compute the end value for the additional bypass (if applicable).
- if (AdditionalBypass.first) {
- B.SetInsertPoint(AdditionalBypass.first,
- AdditionalBypass.first->getFirstInsertionPt());
+ if (MainVectorTripCount) {
+ B.SetInsertPoint(getAdditionalBypassBlock(),
+ getAdditionalBypassBlock()->getFirstInsertionPt());
EndValueFromAdditionalBypass =
- emitTransformedIndex(B, AdditionalBypass.second, II.getStartValue(),
- Step, II.getKind(), II.getInductionBinOp());
+ emitTransformedIndex(B, MainVectorTripCount, II.getStartValue(), Step,
+ II.getKind(), II.getInductionBinOp());
EndValueFromAdditionalBypass->setName("ind.end");
}
}
@@ -2632,8 +2643,8 @@ PHINode *InnerLoopVectorizer::createInductionResumeValue(
for (BasicBlock *BB : BypassBlocks)
BCResumeVal->addIncoming(II.getStartValue(), BB);
- if (AdditionalBypass.first)
- BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
+ if (MainVectorTripCount)
+ BCResumeVal->setIncomingValueForBlock(getAdditionalBypassBlock(),
EndValueFromAdditionalBypass);
return BCResumeVal;
}
@@ -2653,11 +2664,7 @@ static Value *getExpandedStep(const InductionDescriptor &ID,
}
void InnerLoopVectorizer::createInductionResumeValues(
- const SCEV2ValueTy &ExpandedSCEVs,
- std::pair<BasicBlock *, Value *> AdditionalBypass) {
- assert(((AdditionalBypass.first && AdditionalBypass.second) ||
- (!AdditionalBypass.first && !AdditionalBypass.second)) &&
- "Inconsistent information about additional bypass.");
+ const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount) {
// We are going to resume the execution of the scalar loop.
// Go over all of the induction variables that we found and fix the
// PHIs that are left in the scalar version of the loop.
@@ -2670,7 +2677,7 @@ void InnerLoopVectorizer::createInductionResumeValues(
const InductionDescriptor &II = InductionEntry.second;
PHINode *BCResumeVal = createInductionResumeValue(
OrigPhi, II, getExpandedStep(II, ExpandedSCEVs), LoopBypassBlocks,
- AdditionalBypass);
+ MainVectorTripCount);
OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
}
}
@@ -7918,6 +7925,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
nullptr, "vec.epilog.iter.check", true);
emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
VecEpilogueIterationCountCheck);
+ AdditionalBypassBlock = VecEpilogueIterationCountCheck;
// Adjust the control flow taking the state info from the main loop
// vectorization into account.
@@ -8002,11 +8010,8 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
// iterations left once the vector loop has completed.
// Note that when the vectorized epilogue is skipped due to iteration count
// check, then the resume value for the induction variable comes from
- // the trip count of the main vector loop, hence passing the AdditionalBypass
- // argument.
- createInductionResumeValues(ExpandedSCEVs,
- {VecEpilogueIterationCountCheck,
- EPI.VectorTripCount} /* AdditionalBypass */);
+ // the trip count of the main vector loop, passed as the second argument.
+ createInductionResumeValues(ExpandedSCEVs, EPI.VectorTripCount);
return {LoopVectorPreHeader, EPResumeVal};
}
@@ -10325,7 +10330,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
ResumeV = MainILV.createInductionResumeValue(
IndPhi, *ID, getExpandedStep(*ID, ExpandedSCEVs),
- {EPI.MainLoopIterationCountCheck});
+ EPI.MainLoopIterationCountCheck);
}
assert(ResumeV && "Must have a resume value");
VPValue *StartVal = BestEpiPlan.getOrAddLiveIn(ResumeV);
>From ce4801d373df1c2ddcc602add133066640c7073d Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Fri, 6 Dec 2024 10:30:17 +0000
Subject: [PATCH 08/18] [X86] vpdpwssd.ll - add test coverage for #118443
---
llvm/test/CodeGen/X86/vpdpwssd.ll | 169 +++++++++++++++++++++++++++++-
1 file changed, 166 insertions(+), 3 deletions(-)
diff --git a/llvm/test/CodeGen/X86/vpdpwssd.ll b/llvm/test/CodeGen/X86/vpdpwssd.ll
index 3c1eb92e9e3c3f..c2c59e6be87977 100644
--- a/llvm/test/CodeGen/X86/vpdpwssd.ll
+++ b/llvm/test/CodeGen/X86/vpdpwssd.ll
@@ -1,7 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver4 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver5 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+fast-dpwssd | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver4 | FileCheck %s --check-prefixes=CHECK,ZNVER
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver5 | FileCheck %s --check-prefixes=CHECK,ZNVER
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+fast-dpwssd | FileCheck %s --check-prefixes=CHECK,AVX512-VNNI
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+avx512vl,+fast-dpwssd | FileCheck %s --check-prefixes=CHECK,AVX512VL-VNNI
define <16 x i32> @vpdpwssd_test(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2) {
; CHECK-LABEL: vpdpwssd_test:
@@ -11,3 +12,165 @@ define <16 x i32> @vpdpwssd_test(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2) {
%4 = tail call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
ret <16 x i32> %4
}
+
+define <16 x i32> @vpdpwssd_v16i32_accumulate(<32 x i16> %a0, <32 x i16> %a1, <16 x i32> %a2) {
+; ZNVER-LABEL: vpdpwssd_v16i32_accumulate:
+; ZNVER: # %bb.0:
+; ZNVER-NEXT: vpmovsxwd %ymm0, %zmm3
+; ZNVER-NEXT: vpmovsxwd %ymm1, %zmm4
+; ZNVER-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; ZNVER-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; ZNVER-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
+; ZNVER-NEXT: vpmovsxwd %ymm0, %zmm0
+; ZNVER-NEXT: vpmovsxwd %ymm1, %zmm1
+; ZNVER-NEXT: vpmulld %zmm4, %zmm3, %zmm3
+; ZNVER-NEXT: vpmovsxbd {{.*#+}} zmm4 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
+; ZNVER-NEXT: vpmulld %zmm1, %zmm0, %zmm0
+; ZNVER-NEXT: vpermi2d %zmm0, %zmm3, %zmm5
+; ZNVER-NEXT: vpermi2d %zmm0, %zmm3, %zmm4
+; ZNVER-NEXT: vpaddd %zmm2, %zmm5, %zmm0
+; ZNVER-NEXT: vpaddd %zmm4, %zmm0, %zmm0
+; ZNVER-NEXT: retq
+;
+; AVX512-VNNI-LABEL: vpdpwssd_v16i32_accumulate:
+; AVX512-VNNI: # %bb.0:
+; AVX512-VNNI-NEXT: vpmovsxwd %ymm0, %zmm3
+; AVX512-VNNI-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512-VNNI-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512-VNNI-NEXT: vpmovsxwd %ymm1, %zmm4
+; AVX512-VNNI-NEXT: vpmulld %zmm4, %zmm3, %zmm3
+; AVX512-VNNI-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512-VNNI-NEXT: vpmovsxwd %ymm1, %zmm1
+; AVX512-VNNI-NEXT: vpmulld %zmm1, %zmm0, %zmm0
+; AVX512-VNNI-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
+; AVX512-VNNI-NEXT: vpermi2d %zmm0, %zmm3, %zmm1
+; AVX512-VNNI-NEXT: vpmovsxbd {{.*#+}} zmm4 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
+; AVX512-VNNI-NEXT: vpermi2d %zmm0, %zmm3, %zmm4
+; AVX512-VNNI-NEXT: vpaddd %zmm2, %zmm1, %zmm0
+; AVX512-VNNI-NEXT: vpaddd %zmm4, %zmm0, %zmm0
+; AVX512-VNNI-NEXT: retq
+;
+; AVX512VL-VNNI-LABEL: vpdpwssd_v16i32_accumulate:
+; AVX512VL-VNNI: # %bb.0:
+; AVX512VL-VNNI-NEXT: vpmovsxwd %ymm0, %zmm3
+; AVX512VL-VNNI-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512VL-VNNI-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512VL-VNNI-NEXT: vpmovsxwd %ymm1, %zmm4
+; AVX512VL-VNNI-NEXT: vpmulld %zmm4, %zmm3, %zmm3
+; AVX512VL-VNNI-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; AVX512VL-VNNI-NEXT: vpmovsxwd %ymm1, %zmm1
+; AVX512VL-VNNI-NEXT: vpmulld %zmm1, %zmm0, %zmm0
+; AVX512VL-VNNI-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
+; AVX512VL-VNNI-NEXT: vpermi2d %zmm0, %zmm3, %zmm1
+; AVX512VL-VNNI-NEXT: vpmovsxbd {{.*#+}} zmm4 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
+; AVX512VL-VNNI-NEXT: vpermi2d %zmm0, %zmm3, %zmm4
+; AVX512VL-VNNI-NEXT: vpaddd %zmm2, %zmm1, %zmm0
+; AVX512VL-VNNI-NEXT: vpaddd %zmm4, %zmm0, %zmm0
+; AVX512VL-VNNI-NEXT: retq
+ %x0 = sext <32 x i16> %a0 to <32 x i32>
+ %x1 = sext <32 x i16> %a1 to <32 x i32>
+ %m = mul nsw <32 x i32> %x0, %x1
+ %lo = shufflevector <32 x i32> %m, <32 x i32> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+ %hi = shufflevector <32 x i32> %m, <32 x i32> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+ %r0 = add <16 x i32> %lo, %a2
+ %r1 = add <16 x i32> %r0, %hi
+ ret <16 x i32> %r1
+}
+
+define <8 x i32> @vpdpwssd_v8i32_accumulate(<16 x i16> %a0, <16 x i16> %a1, <8 x i32> %a2) {
+; ZNVER-LABEL: vpdpwssd_v8i32_accumulate:
+; ZNVER: # %bb.0:
+; ZNVER-NEXT: vpmovsxwd %ymm0, %zmm0
+; ZNVER-NEXT: vpmovsxwd %ymm1, %zmm1
+; ZNVER-NEXT: vpmulld %zmm1, %zmm0, %zmm0
+; ZNVER-NEXT: vpmovqd %zmm0, %ymm1
+; ZNVER-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; ZNVER-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm3[1,3],ymm0[5,7],ymm3[5,7]
+; ZNVER-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; ZNVER-NEXT: vpaddd %ymm2, %ymm0, %ymm0
+; ZNVER-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; ZNVER-NEXT: retq
+;
+; AVX512-VNNI-LABEL: vpdpwssd_v8i32_accumulate:
+; AVX512-VNNI: # %bb.0:
+; AVX512-VNNI-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512-VNNI-NEXT: vpmovsxwd %ymm1, %zmm1
+; AVX512-VNNI-NEXT: vpmulld %zmm1, %zmm0, %zmm0
+; AVX512-VNNI-NEXT: vpmovqd %zmm0, %ymm1
+; AVX512-VNNI-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512-VNNI-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm3[1,3],ymm0[5,7],ymm3[5,7]
+; AVX512-VNNI-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512-VNNI-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; AVX512-VNNI-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; AVX512-VNNI-NEXT: retq
+;
+; AVX512VL-VNNI-LABEL: vpdpwssd_v8i32_accumulate:
+; AVX512VL-VNNI: # %bb.0:
+; AVX512VL-VNNI-NEXT: vpmovsxwd %ymm0, %zmm0
+; AVX512VL-VNNI-NEXT: vpmovsxwd %ymm1, %zmm1
+; AVX512VL-VNNI-NEXT: vpmulld %zmm1, %zmm0, %zmm0
+; AVX512VL-VNNI-NEXT: vpmovqd %zmm0, %ymm1
+; AVX512VL-VNNI-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512VL-VNNI-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm3[1,3],ymm0[5,7],ymm3[5,7]
+; AVX512VL-VNNI-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512VL-VNNI-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; AVX512VL-VNNI-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; AVX512VL-VNNI-NEXT: retq
+ %x0 = sext <16 x i16> %a0 to <16 x i32>
+ %x1 = sext <16 x i16> %a1 to <16 x i32>
+ %m = mul nsw <16 x i32> %x0, %x1
+ %lo = shufflevector <16 x i32> %m, <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %hi = shufflevector <16 x i32> %m, <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %r0 = add <8 x i32> %hi, %a2
+ %r1 = add <8 x i32> %lo, %r0
+ ret <8 x i32> %r1
+}
+
+define <4 x i32> @vpdpwssd_v4i32_accumulate(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) {
+; ZNVER-LABEL: vpdpwssd_v4i32_accumulate:
+; ZNVER: # %bb.0:
+; ZNVER-NEXT: vpmovsxwd %xmm0, %ymm0
+; ZNVER-NEXT: vpmovsxwd %xmm1, %ymm1
+; ZNVER-NEXT: vpmulld %ymm1, %ymm0, %ymm0
+; ZNVER-NEXT: vpmovqd %ymm0, %xmm1
+; ZNVER-NEXT: vextracti128 $1, %ymm0, %xmm3
+; ZNVER-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[1,3]
+; ZNVER-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; ZNVER-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; ZNVER-NEXT: vzeroupper
+; ZNVER-NEXT: retq
+;
+; AVX512-VNNI-LABEL: vpdpwssd_v4i32_accumulate:
+; AVX512-VNNI: # %bb.0:
+; AVX512-VNNI-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX512-VNNI-NEXT: vpmovsxwd %xmm1, %ymm1
+; AVX512-VNNI-NEXT: vpmulld %ymm1, %ymm0, %ymm0
+; AVX512-VNNI-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512-VNNI-NEXT: vshufps {{.*#+}} xmm3 = xmm0[0,2],xmm1[0,2]
+; AVX512-VNNI-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; AVX512-VNNI-NEXT: vpaddd %xmm2, %xmm3, %xmm1
+; AVX512-VNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX512-VNNI-NEXT: vzeroupper
+; AVX512-VNNI-NEXT: retq
+;
+; AVX512VL-VNNI-LABEL: vpdpwssd_v4i32_accumulate:
+; AVX512VL-VNNI: # %bb.0:
+; AVX512VL-VNNI-NEXT: vpmovsxwd %xmm0, %ymm0
+; AVX512VL-VNNI-NEXT: vpmovsxwd %xmm1, %ymm1
+; AVX512VL-VNNI-NEXT: vpmulld %ymm1, %ymm0, %ymm0
+; AVX512VL-VNNI-NEXT: vpmovqd %ymm0, %xmm1
+; AVX512VL-VNNI-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX512VL-VNNI-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[1,3]
+; AVX512VL-VNNI-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX512VL-VNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX512VL-VNNI-NEXT: vzeroupper
+; AVX512VL-VNNI-NEXT: retq
+ %x0 = sext <8 x i16> %a0 to <8 x i32>
+ %x1 = sext <8 x i16> %a1 to <8 x i32>
+ %m = mul nsw <8 x i32> %x0, %x1
+ %lo = shufflevector <8 x i32> %m, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %hi = shufflevector <8 x i32> %m, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %r0 = add <4 x i32> %lo, %a2
+ %r1 = add <4 x i32> %hi, %r0
+ ret <4 x i32> %r1
+}
>From 9d2351ab9aff3741e3f4e10ab7ebabc77a6079d6 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Fri, 6 Dec 2024 11:33:35 +0000
Subject: [PATCH 09/18] [X86] matchPMADDWD - add matching for (add X, (pmaddwd
 Y, Z)) reassociation patterns.
Allows us to match pmaddwd accumulation patterns and fold them to vpdpwssd instructions on VNNI targets.
Fixes #118433
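For illustration, a minimal IR sketch of the accumulation shape the new matching targets; it mirrors the vpdpwssd_v4i32_accumulate test updated below, and the function name @accum_sketch is purely illustrative:
define <4 x i32> @accum_sketch(<8 x i16> %a, <8 x i16> %b, <4 x i32> %acc) {
  ; sign-extend both i16 inputs and multiply elementwise
  %xa = sext <8 x i16> %a to <8 x i32>
  %xb = sext <8 x i16> %b to <8 x i32>
  %m = mul nsw <8 x i32> %xa, %xb
  ; pairwise even/odd lane adds form the pmaddwd shape
  %lo = shufflevector <8 x i32> %m, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %hi = shufflevector <8 x i32> %m, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  ; the extra accumulator add is what the new reassociation matching recognises
  %r0 = add <4 x i32> %lo, %acc
  %r1 = add <4 x i32> %hi, %r0
  ret <4 x i32> %r1
}
Per the commit message and the updated checks below, this pmaddwd-plus-accumulator chain can then fold to a single vpdpwssd on VNNI targets.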
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 12 +-
llvm/test/CodeGen/X86/vpdpwssd.ll | 150 ++++++++----------------
2 files changed, 55 insertions(+), 107 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f713f2ed209e1c..ff21aa975033cf 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -56471,9 +56471,12 @@ static SDValue matchPMADDWD(SelectionDAG &DAG, SDNode *N,
!isPowerOf2_32(VT.getVectorNumElements()))
return SDValue();
- SDValue Op0, Op1;
+ SDValue Op0, Op1, Accum;
if (!sd_match(N, m_Add(m_AllOf(m_Opc(ISD::BUILD_VECTOR), m_Value(Op0)),
- m_AllOf(m_Opc(ISD::BUILD_VECTOR), m_Value(Op1)))))
+ m_AllOf(m_Opc(ISD::BUILD_VECTOR), m_Value(Op1)))) &&
+ !sd_match(N, m_Add(m_AllOf(m_Opc(ISD::BUILD_VECTOR), m_Value(Op0)),
+ m_Add(m_Value(Accum), m_AllOf(m_Opc(ISD::BUILD_VECTOR),
+ m_Value(Op1))))))
return SDValue();
// Check if one of Op0,Op1 is of the form:
@@ -56549,7 +56552,10 @@ static SDValue matchPMADDWD(SelectionDAG &DAG, SDNode *N,
InVT.getVectorNumElements() / 2);
return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
};
- return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder);
+ SDValue R = SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMADDBuilder);
+ if (Accum)
+ R = DAG.getNode(ISD::ADD, DL, VT, R, Accum);
+ return R;
}
// Attempt to turn this pattern into PMADDWD.
diff --git a/llvm/test/CodeGen/X86/vpdpwssd.ll b/llvm/test/CodeGen/X86/vpdpwssd.ll
index c2c59e6be87977..f7cd6f8f1b8961 100644
--- a/llvm/test/CodeGen/X86/vpdpwssd.ll
+++ b/llvm/test/CodeGen/X86/vpdpwssd.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver4 | FileCheck %s --check-prefixes=CHECK,ZNVER
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver5 | FileCheck %s --check-prefixes=CHECK,ZNVER
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver4 | FileCheck %s --check-prefixes=CHECK,ZNVER,AVX512BW-VNNI
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver5 | FileCheck %s --check-prefixes=CHECK,ZNVER,AVX-VNNI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+fast-dpwssd | FileCheck %s --check-prefixes=CHECK,AVX512-VNNI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+avx512vl,+fast-dpwssd | FileCheck %s --check-prefixes=CHECK,AVX512VL-VNNI
@@ -16,56 +16,28 @@ define <16 x i32> @vpdpwssd_test(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2) {
define <16 x i32> @vpdpwssd_v16i32_accumulate(<32 x i16> %a0, <32 x i16> %a1, <16 x i32> %a2) {
; ZNVER-LABEL: vpdpwssd_v16i32_accumulate:
; ZNVER: # %bb.0:
-; ZNVER-NEXT: vpmovsxwd %ymm0, %zmm3
-; ZNVER-NEXT: vpmovsxwd %ymm1, %zmm4
-; ZNVER-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; ZNVER-NEXT: vextracti64x4 $1, %zmm1, %ymm1
-; ZNVER-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
-; ZNVER-NEXT: vpmovsxwd %ymm0, %zmm0
-; ZNVER-NEXT: vpmovsxwd %ymm1, %zmm1
-; ZNVER-NEXT: vpmulld %zmm4, %zmm3, %zmm3
-; ZNVER-NEXT: vpmovsxbd {{.*#+}} zmm4 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
-; ZNVER-NEXT: vpmulld %zmm1, %zmm0, %zmm0
-; ZNVER-NEXT: vpermi2d %zmm0, %zmm3, %zmm5
-; ZNVER-NEXT: vpermi2d %zmm0, %zmm3, %zmm4
-; ZNVER-NEXT: vpaddd %zmm2, %zmm5, %zmm0
-; ZNVER-NEXT: vpaddd %zmm4, %zmm0, %zmm0
+; ZNVER-NEXT: vpdpwssd %zmm1, %zmm0, %zmm2
+; ZNVER-NEXT: vmovdqa64 %zmm2, %zmm0
; ZNVER-NEXT: retq
;
; AVX512-VNNI-LABEL: vpdpwssd_v16i32_accumulate:
; AVX512-VNNI: # %bb.0:
-; AVX512-VNNI-NEXT: vpmovsxwd %ymm0, %zmm3
-; AVX512-VNNI-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; AVX512-VNNI-NEXT: vpmovsxwd %ymm0, %zmm0
-; AVX512-VNNI-NEXT: vpmovsxwd %ymm1, %zmm4
-; AVX512-VNNI-NEXT: vpmulld %zmm4, %zmm3, %zmm3
-; AVX512-VNNI-NEXT: vextracti64x4 $1, %zmm1, %ymm1
-; AVX512-VNNI-NEXT: vpmovsxwd %ymm1, %zmm1
-; AVX512-VNNI-NEXT: vpmulld %zmm1, %zmm0, %zmm0
-; AVX512-VNNI-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
-; AVX512-VNNI-NEXT: vpermi2d %zmm0, %zmm3, %zmm1
-; AVX512-VNNI-NEXT: vpmovsxbd {{.*#+}} zmm4 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
-; AVX512-VNNI-NEXT: vpermi2d %zmm0, %zmm3, %zmm4
-; AVX512-VNNI-NEXT: vpaddd %zmm2, %zmm1, %zmm0
-; AVX512-VNNI-NEXT: vpaddd %zmm4, %zmm0, %zmm0
+; AVX512-VNNI-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512-VNNI-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; AVX512-VNNI-NEXT: vpmaddwd %ymm3, %ymm4, %ymm3
+; AVX512-VNNI-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0
+; AVX512-VNNI-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512-VNNI-NEXT: vpaddd %zmm2, %zmm0, %zmm0
; AVX512-VNNI-NEXT: retq
;
; AVX512VL-VNNI-LABEL: vpdpwssd_v16i32_accumulate:
; AVX512VL-VNNI: # %bb.0:
-; AVX512VL-VNNI-NEXT: vpmovsxwd %ymm0, %zmm3
-; AVX512VL-VNNI-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; AVX512VL-VNNI-NEXT: vpmovsxwd %ymm0, %zmm0
-; AVX512VL-VNNI-NEXT: vpmovsxwd %ymm1, %zmm4
-; AVX512VL-VNNI-NEXT: vpmulld %zmm4, %zmm3, %zmm3
-; AVX512VL-VNNI-NEXT: vextracti64x4 $1, %zmm1, %ymm1
-; AVX512VL-VNNI-NEXT: vpmovsxwd %ymm1, %zmm1
-; AVX512VL-VNNI-NEXT: vpmulld %zmm1, %zmm0, %zmm0
-; AVX512VL-VNNI-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
-; AVX512VL-VNNI-NEXT: vpermi2d %zmm0, %zmm3, %zmm1
-; AVX512VL-VNNI-NEXT: vpmovsxbd {{.*#+}} zmm4 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
-; AVX512VL-VNNI-NEXT: vpermi2d %zmm0, %zmm3, %zmm4
-; AVX512VL-VNNI-NEXT: vpaddd %zmm2, %zmm1, %zmm0
-; AVX512VL-VNNI-NEXT: vpaddd %zmm4, %zmm0, %zmm0
+; AVX512VL-VNNI-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512VL-VNNI-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; AVX512VL-VNNI-NEXT: vpmaddwd %ymm3, %ymm4, %ymm3
+; AVX512VL-VNNI-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0
+; AVX512VL-VNNI-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512VL-VNNI-NEXT: vpaddd %zmm2, %zmm0, %zmm0
; AVX512VL-VNNI-NEXT: retq
%x0 = sext <32 x i16> %a0 to <32 x i32>
%x1 = sext <32 x i16> %a1 to <32 x i32>
@@ -78,43 +50,28 @@ define <16 x i32> @vpdpwssd_v16i32_accumulate(<32 x i16> %a0, <32 x i16> %a1, <1
}
define <8 x i32> @vpdpwssd_v8i32_accumulate(<16 x i16> %a0, <16 x i16> %a1, <8 x i32> %a2) {
-; ZNVER-LABEL: vpdpwssd_v8i32_accumulate:
-; ZNVER: # %bb.0:
-; ZNVER-NEXT: vpmovsxwd %ymm0, %zmm0
-; ZNVER-NEXT: vpmovsxwd %ymm1, %zmm1
-; ZNVER-NEXT: vpmulld %zmm1, %zmm0, %zmm0
-; ZNVER-NEXT: vpmovqd %zmm0, %ymm1
-; ZNVER-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; ZNVER-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm3[1,3],ymm0[5,7],ymm3[5,7]
-; ZNVER-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; ZNVER-NEXT: vpaddd %ymm2, %ymm0, %ymm0
-; ZNVER-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; ZNVER-NEXT: retq
+; AVX512BW-VNNI-LABEL: vpdpwssd_v8i32_accumulate:
+; AVX512BW-VNNI: # %bb.0:
+; AVX512BW-VNNI-NEXT: vpdpwssd %ymm1, %ymm0, %ymm2
+; AVX512BW-VNNI-NEXT: vmovdqa %ymm2, %ymm0
+; AVX512BW-VNNI-NEXT: retq
+;
+; AVX-VNNI-LABEL: vpdpwssd_v8i32_accumulate:
+; AVX-VNNI: # %bb.0:
+; AVX-VNNI-NEXT: {vex} vpdpwssd %ymm1, %ymm0, %ymm2
+; AVX-VNNI-NEXT: vmovdqa %ymm2, %ymm0
+; AVX-VNNI-NEXT: retq
;
; AVX512-VNNI-LABEL: vpdpwssd_v8i32_accumulate:
; AVX512-VNNI: # %bb.0:
-; AVX512-VNNI-NEXT: vpmovsxwd %ymm0, %zmm0
-; AVX512-VNNI-NEXT: vpmovsxwd %ymm1, %zmm1
-; AVX512-VNNI-NEXT: vpmulld %zmm1, %zmm0, %zmm0
-; AVX512-VNNI-NEXT: vpmovqd %zmm0, %ymm1
-; AVX512-VNNI-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512-VNNI-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm3[1,3],ymm0[5,7],ymm3[5,7]
-; AVX512-VNNI-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512-VNNI-NEXT: vpaddd %ymm2, %ymm1, %ymm1
-; AVX512-VNNI-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; AVX512-VNNI-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0
+; AVX512-VNNI-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; AVX512-VNNI-NEXT: retq
;
; AVX512VL-VNNI-LABEL: vpdpwssd_v8i32_accumulate:
; AVX512VL-VNNI: # %bb.0:
-; AVX512VL-VNNI-NEXT: vpmovsxwd %ymm0, %zmm0
-; AVX512VL-VNNI-NEXT: vpmovsxwd %ymm1, %zmm1
-; AVX512VL-VNNI-NEXT: vpmulld %zmm1, %zmm0, %zmm0
-; AVX512VL-VNNI-NEXT: vpmovqd %zmm0, %ymm1
-; AVX512VL-VNNI-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512VL-VNNI-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm3[1,3],ymm0[5,7],ymm3[5,7]
-; AVX512VL-VNNI-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512VL-VNNI-NEXT: vpaddd %ymm2, %ymm1, %ymm1
-; AVX512VL-VNNI-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; AVX512VL-VNNI-NEXT: vpdpwssd %ymm1, %ymm0, %ymm2
+; AVX512VL-VNNI-NEXT: vmovdqa %ymm2, %ymm0
; AVX512VL-VNNI-NEXT: retq
%x0 = sext <16 x i16> %a0 to <16 x i32>
%x1 = sext <16 x i16> %a1 to <16 x i32>
@@ -127,43 +84,28 @@ define <8 x i32> @vpdpwssd_v8i32_accumulate(<16 x i16> %a0, <16 x i16> %a1, <8 x
}
define <4 x i32> @vpdpwssd_v4i32_accumulate(<8 x i16> %a0, <8 x i16> %a1, <4 x i32> %a2) {
-; ZNVER-LABEL: vpdpwssd_v4i32_accumulate:
-; ZNVER: # %bb.0:
-; ZNVER-NEXT: vpmovsxwd %xmm0, %ymm0
-; ZNVER-NEXT: vpmovsxwd %xmm1, %ymm1
-; ZNVER-NEXT: vpmulld %ymm1, %ymm0, %ymm0
-; ZNVER-NEXT: vpmovqd %ymm0, %xmm1
-; ZNVER-NEXT: vextracti128 $1, %ymm0, %xmm3
-; ZNVER-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[1,3]
-; ZNVER-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; ZNVER-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; ZNVER-NEXT: vzeroupper
-; ZNVER-NEXT: retq
+; AVX512BW-VNNI-LABEL: vpdpwssd_v4i32_accumulate:
+; AVX512BW-VNNI: # %bb.0:
+; AVX512BW-VNNI-NEXT: vpdpwssd %xmm1, %xmm0, %xmm2
+; AVX512BW-VNNI-NEXT: vmovdqa %xmm2, %xmm0
+; AVX512BW-VNNI-NEXT: retq
+;
+; AVX-VNNI-LABEL: vpdpwssd_v4i32_accumulate:
+; AVX-VNNI: # %bb.0:
+; AVX-VNNI-NEXT: {vex} vpdpwssd %xmm1, %xmm0, %xmm2
+; AVX-VNNI-NEXT: vmovdqa %xmm2, %xmm0
+; AVX-VNNI-NEXT: retq
;
; AVX512-VNNI-LABEL: vpdpwssd_v4i32_accumulate:
; AVX512-VNNI: # %bb.0:
-; AVX512-VNNI-NEXT: vpmovsxwd %xmm0, %ymm0
-; AVX512-VNNI-NEXT: vpmovsxwd %xmm1, %ymm1
-; AVX512-VNNI-NEXT: vpmulld %ymm1, %ymm0, %ymm0
-; AVX512-VNNI-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-VNNI-NEXT: vshufps {{.*#+}} xmm3 = xmm0[0,2],xmm1[0,2]
-; AVX512-VNNI-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; AVX512-VNNI-NEXT: vpaddd %xmm2, %xmm3, %xmm1
-; AVX512-VNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-VNNI-NEXT: vzeroupper
+; AVX512-VNNI-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
+; AVX512-VNNI-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX512-VNNI-NEXT: retq
;
; AVX512VL-VNNI-LABEL: vpdpwssd_v4i32_accumulate:
; AVX512VL-VNNI: # %bb.0:
-; AVX512VL-VNNI-NEXT: vpmovsxwd %xmm0, %ymm0
-; AVX512VL-VNNI-NEXT: vpmovsxwd %xmm1, %ymm1
-; AVX512VL-VNNI-NEXT: vpmulld %ymm1, %ymm0, %ymm0
-; AVX512VL-VNNI-NEXT: vpmovqd %ymm0, %xmm1
-; AVX512VL-VNNI-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX512VL-VNNI-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[1,3]
-; AVX512VL-VNNI-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; AVX512VL-VNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512VL-VNNI-NEXT: vzeroupper
+; AVX512VL-VNNI-NEXT: vpdpwssd %xmm1, %xmm0, %xmm2
+; AVX512VL-VNNI-NEXT: vmovdqa %xmm2, %xmm0
; AVX512VL-VNNI-NEXT: retq
%x0 = sext <8 x i16> %a0 to <8 x i32>
%x1 = sext <8 x i16> %a1 to <8 x i32>
>From 76db47335903cb65d3027c0a77658f488d8ce659 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Fri, 6 Dec 2024 11:38:02 +0000
Subject: [PATCH 10/18] [AArch64] Add bf16 instruction coverage. NFC
These are the same tests as fp16-instructions.ll, fp16-v4-instructions.ll and
fp16-v8-instructions.ll, ported to bf16.
---
.../test/CodeGen/AArch64/bf16-instructions.ll | 2347 +++++++++++++++++
.../CodeGen/AArch64/bf16-v4-instructions.ll | 711 +++++
.../CodeGen/AArch64/bf16-v8-instructions.ll | 2192 +++++++++++++++
3 files changed, 5250 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/bf16-instructions.ll
create mode 100644 llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll
create mode 100644 llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll
diff --git a/llvm/test/CodeGen/AArch64/bf16-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-instructions.ll
new file mode 100644
index 00000000000000..33997614598c3a
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/bf16-instructions.ll
@@ -0,0 +1,2347 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple aarch64-unknown-unknown | FileCheck %s --check-prefixes=CHECK,CHECK-CVT
+; RUN: llc < %s -mtriple aarch64-unknown-unknown -mattr=+bf16 | FileCheck %s --check-prefixes=CHECK,CHECK-BF16
+
+define bfloat @test_fadd(bfloat %a, bfloat %b) #0 {
+; CHECK-CVT-LABEL: test_fadd:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w9, s1
+; CHECK-CVT-NEXT: fmov w10, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: lsl w10, w10, #16
+; CHECK-CVT-NEXT: fmov s0, w9
+; CHECK-CVT-NEXT: fmov s1, w10
+; CHECK-CVT-NEXT: fadd s0, s1, s0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_fadd:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s1
+; CHECK-BF16-NEXT: fmov w9, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: lsl w9, w9, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: fmov s1, w9
+; CHECK-BF16-NEXT: fadd s0, s1, s0
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %r = fadd bfloat %a, %b
+ ret bfloat %r
+}
+
+define bfloat @test_fsub(bfloat %a, bfloat %b) #0 {
+; CHECK-CVT-LABEL: test_fsub:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w9, s1
+; CHECK-CVT-NEXT: fmov w10, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: lsl w10, w10, #16
+; CHECK-CVT-NEXT: fmov s0, w9
+; CHECK-CVT-NEXT: fmov s1, w10
+; CHECK-CVT-NEXT: fsub s0, s1, s0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_fsub:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s1
+; CHECK-BF16-NEXT: fmov w9, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: lsl w9, w9, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: fmov s1, w9
+; CHECK-BF16-NEXT: fsub s0, s1, s0
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %r = fsub bfloat %a, %b
+ ret bfloat %r
+}
+
+define bfloat @test_fmul(bfloat %a, bfloat %b) #0 {
+; CHECK-CVT-LABEL: test_fmul:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w9, s1
+; CHECK-CVT-NEXT: fmov w10, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: lsl w10, w10, #16
+; CHECK-CVT-NEXT: fmov s0, w9
+; CHECK-CVT-NEXT: fmov s1, w10
+; CHECK-CVT-NEXT: fmul s0, s1, s0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_fmul:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s1
+; CHECK-BF16-NEXT: fmov w9, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: lsl w9, w9, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: fmov s1, w9
+; CHECK-BF16-NEXT: fmul s0, s1, s0
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %r = fmul bfloat %a, %b
+ ret bfloat %r
+}
+
+define bfloat @test_fmadd(bfloat %a, bfloat %b, bfloat %c) #0 {
+; CHECK-CVT-LABEL: test_fmadd:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w8, s1
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w10, #32767 // =0x7fff
+; CHECK-CVT-NEXT: // kill: def $h2 killed $h2 def $s2
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: fmov s1, w9
+; CHECK-CVT-NEXT: fmul s0, s1, s0
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: ubfx w9, w8, #16, #1
+; CHECK-CVT-NEXT: add w8, w8, w10
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: fmov w9, s2
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: fmov s1, w9
+; CHECK-CVT-NEXT: fadd s0, s0, s1
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: ubfx w9, w8, #16, #1
+; CHECK-CVT-NEXT: add w8, w8, w10
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_fmadd:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s1
+; CHECK-BF16-NEXT: fmov w9, s0
+; CHECK-BF16-NEXT: // kill: def $h2 killed $h2 def $s2
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: lsl w9, w9, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: fmov s1, w9
+; CHECK-BF16-NEXT: fmov w9, s2
+; CHECK-BF16-NEXT: fmul s0, s1, s0
+; CHECK-BF16-NEXT: lsl w9, w9, #16
+; CHECK-BF16-NEXT: fmov s1, w9
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: fadd s0, s0, s1
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %mul = fmul fast bfloat %a, %b
+ %r = fadd fast bfloat %mul, %c
+ ret bfloat %r
+}
+
+define bfloat @test_fdiv(bfloat %a, bfloat %b) #0 {
+; CHECK-CVT-LABEL: test_fdiv:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w9, s1
+; CHECK-CVT-NEXT: fmov w10, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: lsl w10, w10, #16
+; CHECK-CVT-NEXT: fmov s0, w9
+; CHECK-CVT-NEXT: fmov s1, w10
+; CHECK-CVT-NEXT: fdiv s0, s1, s0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_fdiv:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s1
+; CHECK-BF16-NEXT: fmov w9, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: lsl w9, w9, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: fmov s1, w9
+; CHECK-BF16-NEXT: fdiv s0, s1, s0
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %r = fdiv bfloat %a, %b
+ ret bfloat %r
+}
+
+define bfloat @test_frem(bfloat %a, bfloat %b) #0 {
+; CHECK-CVT-LABEL: test_frem:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: fmov w9, s1
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: fmov s1, w9
+; CHECK-CVT-NEXT: bl fmodf
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_frem:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: fmov w9, s1
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: lsl w9, w9, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: fmov s1, w9
+; CHECK-BF16-NEXT: bl fmodf
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT: ret
+ %r = frem bfloat %a, %b
+ ret bfloat %r
+}
+
+define void @test_store(bfloat %a, ptr %b) #0 {
+; CHECK-LABEL: test_store:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str h0, [x0]
+; CHECK-NEXT: ret
+ store bfloat %a, ptr %b
+ ret void
+}
+
+define bfloat @test_load(ptr %a) #0 {
+; CHECK-LABEL: test_load:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr h0, [x0]
+; CHECK-NEXT: ret
+ %r = load bfloat, ptr %a
+ ret bfloat %r
+}
+
+declare bfloat @test_callee(bfloat %a, bfloat %b) #0
+
+define bfloat @test_call(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_call:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: bl test_callee
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %r = call bfloat @test_callee(bfloat %a, bfloat %b)
+ ret bfloat %r
+}
+
+define bfloat @test_call_flipped(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_call_flipped:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: fmov s2, s0
+; CHECK-NEXT: fmov s0, s1
+; CHECK-NEXT: fmov s1, s2
+; CHECK-NEXT: bl test_callee
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %r = call bfloat @test_callee(bfloat %b, bfloat %a)
+ ret bfloat %r
+}
+
+define bfloat @test_tailcall_flipped(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_tailcall_flipped:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov s2, s0
+; CHECK-NEXT: fmov s0, s1
+; CHECK-NEXT: fmov s1, s2
+; CHECK-NEXT: b test_callee
+ %r = tail call bfloat @test_callee(bfloat %b, bfloat %a)
+ ret bfloat %r
+}
+
+define bfloat @test_select(bfloat %a, bfloat %b, i1 zeroext %c) #0 {
+; CHECK-LABEL: test_select:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: cmp w0, #0
+; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT: fcsel s0, s0, s1, ne
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT: ret
+ %r = select i1 %c, bfloat %a, bfloat %b
+ ret bfloat %r
+}
+
+define bfloat @test_select_cc(bfloat %a, bfloat %b, bfloat %c, bfloat %d) #0 {
+; CHECK-LABEL: test_select_cc:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h3 killed $h3 def $s3
+; CHECK-NEXT: // kill: def $h2 killed $h2 def $s2
+; CHECK-NEXT: fmov w8, s3
+; CHECK-NEXT: fmov w9, s2
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s2, w8
+; CHECK-NEXT: fmov s3, w9
+; CHECK-NEXT: fcmp s3, s2
+; CHECK-NEXT: fcsel s0, s0, s1, ne
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT: ret
+ %cc = fcmp une bfloat %c, %d
+ %r = select i1 %cc, bfloat %a, bfloat %b
+ ret bfloat %r
+}
+
+define float @test_select_cc_f32_f16(float %a, float %b, bfloat %c, bfloat %d) #0 {
+; CHECK-LABEL: test_select_cc_f32_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h3 killed $h3 def $s3
+; CHECK-NEXT: // kill: def $h2 killed $h2 def $s2
+; CHECK-NEXT: fmov w8, s3
+; CHECK-NEXT: fmov w9, s2
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s2, w8
+; CHECK-NEXT: fmov s3, w9
+; CHECK-NEXT: fcmp s3, s2
+; CHECK-NEXT: fcsel s0, s0, s1, ne
+; CHECK-NEXT: ret
+ %cc = fcmp une bfloat %c, %d
+ %r = select i1 %cc, float %a, float %b
+ ret float %r
+}
+
+define bfloat @test_select_cc_f16_f32(bfloat %a, bfloat %b, float %c, float %d) #0 {
+; CHECK-LABEL: test_select_cc_f16_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcmp s2, s3
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT: fcsel s0, s0, s1, ne
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT: ret
+ %cc = fcmp une float %c, %d
+ %r = select i1 %cc, bfloat %a, bfloat %b
+ ret bfloat %r
+}
+
+define i1 @test_fcmp_une(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_fcmp_une:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: cset w0, ne
+; CHECK-NEXT: ret
+ %r = fcmp une bfloat %a, %b
+ ret i1 %r
+}
+
+define i1 @test_fcmp_ueq(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_fcmp_ueq:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: cset w8, eq
+; CHECK-NEXT: csinc w0, w8, wzr, vc
+; CHECK-NEXT: ret
+ %r = fcmp ueq bfloat %a, %b
+ ret i1 %r
+}
+
+define i1 @test_fcmp_ugt(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_fcmp_ugt:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: cset w0, hi
+; CHECK-NEXT: ret
+ %r = fcmp ugt bfloat %a, %b
+ ret i1 %r
+}
+
+define i1 @test_fcmp_uge(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_fcmp_uge:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: cset w0, pl
+; CHECK-NEXT: ret
+ %r = fcmp uge bfloat %a, %b
+ ret i1 %r
+}
+
+define i1 @test_fcmp_ult(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_fcmp_ult:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: cset w0, lt
+; CHECK-NEXT: ret
+ %r = fcmp ult bfloat %a, %b
+ ret i1 %r
+}
+
+define i1 @test_fcmp_ule(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_fcmp_ule:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: cset w0, le
+; CHECK-NEXT: ret
+ %r = fcmp ule bfloat %a, %b
+ ret i1 %r
+}
+
+define i1 @test_fcmp_uno(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_fcmp_uno:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: cset w0, vs
+; CHECK-NEXT: ret
+ %r = fcmp uno bfloat %a, %b
+ ret i1 %r
+}
+
+define i1 @test_fcmp_one(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_fcmp_one:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: cset w8, mi
+; CHECK-NEXT: csinc w0, w8, wzr, le
+; CHECK-NEXT: ret
+ %r = fcmp one bfloat %a, %b
+ ret i1 %r
+}
+
+define i1 @test_fcmp_oeq(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_fcmp_oeq:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+ %r = fcmp oeq bfloat %a, %b
+ ret i1 %r
+}
+
+define i1 @test_fcmp_ogt(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_fcmp_ogt:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: cset w0, gt
+; CHECK-NEXT: ret
+ %r = fcmp ogt bfloat %a, %b
+ ret i1 %r
+}
+
+define i1 @test_fcmp_oge(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_fcmp_oge:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: cset w0, ge
+; CHECK-NEXT: ret
+ %r = fcmp oge bfloat %a, %b
+ ret i1 %r
+}
+
+define i1 @test_fcmp_olt(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_fcmp_olt:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: cset w0, mi
+; CHECK-NEXT: ret
+ %r = fcmp olt bfloat %a, %b
+ ret i1 %r
+}
+
+define i1 @test_fcmp_ole(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_fcmp_ole:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: cset w0, ls
+; CHECK-NEXT: ret
+ %r = fcmp ole bfloat %a, %b
+ ret i1 %r
+}
+
+define i1 @test_fcmp_ord(bfloat %a, bfloat %b) #0 {
+; CHECK-LABEL: test_fcmp_ord:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: cset w0, vc
+; CHECK-NEXT: ret
+ %r = fcmp ord bfloat %a, %b
+ ret i1 %r
+}
+
+define void @test_fccmp(bfloat %in, ptr %out) {
+; CHECK-LABEL: test_fccmp:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: movi v1.2s, #69, lsl #24
+; CHECK-NEXT: movi v3.2s, #72, lsl #24
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov s2, w8
+; CHECK-NEXT: adrp x8, .LCPI29_0
+; CHECK-NEXT: fcmp s2, s1
+; CHECK-NEXT: ldr h1, [x8, :lo12:.LCPI29_0]
+; CHECK-NEXT: fccmp s2, s3, #4, mi
+; CHECK-NEXT: fcsel s0, s0, s1, gt
+; CHECK-NEXT: str h0, [x0]
+; CHECK-NEXT: ret
+ %cmp1 = fcmp ogt bfloat %in, 0xR4800
+ %cmp2 = fcmp olt bfloat %in, 0xR4500
+ %cond = and i1 %cmp1, %cmp2
+ %result = select i1 %cond, bfloat %in, bfloat 0xR4500
+ store bfloat %result, ptr %out
+ ret void
+}
+
+define void @test_br_cc(bfloat %a, bfloat %b, ptr %p1, ptr %p2) #0 {
+; CHECK-LABEL: test_br_cc:
+; CHECK: // %bb.0: // %common.ret
+; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: csel x8, x0, x1, pl
+; CHECK-NEXT: str wzr, [x8]
+; CHECK-NEXT: ret
+ %c = fcmp uge bfloat %a, %b
+ br i1 %c, label %then, label %else
+then:
+ store i32 0, ptr %p1
+ ret void
+else:
+ store i32 0, ptr %p2
+ ret void
+}
+
+define bfloat @test_phi(ptr %p1) #0 {
+; CHECK-LABEL: test_phi:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stp d9, d8, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: ldr h9, [x0]
+; CHECK-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: .LBB31_1: // %loop
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: fmov s8, s9
+; CHECK-NEXT: ldr h9, [x19]
+; CHECK-NEXT: mov x0, x19
+; CHECK-NEXT: bl test_dummy
+; CHECK-NEXT: tbnz w0, #0, .LBB31_1
+; CHECK-NEXT: // %bb.2: // %return
+; CHECK-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: fmov s0, s8
+; CHECK-NEXT: ldp d9, d8, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %a = load bfloat, ptr %p1
+ br label %loop
+loop:
+ %r = phi bfloat [%a, %entry], [%b, %loop]
+ %b = load bfloat, ptr %p1
+ %c = call i1 @test_dummy(ptr %p1)
+ br i1 %c, label %loop, label %return
+return:
+ ret bfloat %r
+}
+
+declare i1 @test_dummy(ptr %p1) #0
+
+define i32 @test_fptosi_i32(bfloat %a) #0 {
+; CHECK-LABEL: test_fptosi_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fcvtzs w0, s0
+; CHECK-NEXT: ret
+ %r = fptosi bfloat %a to i32
+ ret i32 %r
+}
+
+define i64 @test_fptosi_i64(bfloat %a) #0 {
+; CHECK-LABEL: test_fptosi_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fcvtzs x0, s0
+; CHECK-NEXT: ret
+ %r = fptosi bfloat %a to i64
+ ret i64 %r
+}
+
+define i32 @test_fptoui_i32(bfloat %a) #0 {
+; CHECK-LABEL: test_fptoui_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fcvtzu w0, s0
+; CHECK-NEXT: ret
+ %r = fptoui bfloat %a to i32
+ ret i32 %r
+}
+
+define i64 @test_fptoui_i64(bfloat %a) #0 {
+; CHECK-LABEL: test_fptoui_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fcvtzu x0, s0
+; CHECK-NEXT: ret
+ %r = fptoui bfloat %a to i64
+ ret i64 %r
+}
+
+define bfloat @test_uitofp_i32(i32 %a) #0 {
+; CHECK-CVT-LABEL: test_uitofp_i32:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: ucvtf d0, w0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: fcvtxn s0, d0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_uitofp_i32:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: ucvtf d0, w0
+; CHECK-BF16-NEXT: fcvtxn s0, d0
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %r = uitofp i32 %a to bfloat
+ ret bfloat %r
+}
+
+define bfloat @test_uitofp_i64(i64 %a) #0 {
+; CHECK-CVT-LABEL: test_uitofp_i64:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: lsr x9, x0, #53
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: cmp x9, #0
+; CHECK-CVT-NEXT: and x9, x0, #0xfffffffffffff000
+; CHECK-CVT-NEXT: csel x9, x9, x0, ne
+; CHECK-CVT-NEXT: ucvtf d0, x9
+; CHECK-CVT-NEXT: cset w9, ne
+; CHECK-CVT-NEXT: tst x0, #0xfff
+; CHECK-CVT-NEXT: csel w9, wzr, w9, eq
+; CHECK-CVT-NEXT: fmov x10, d0
+; CHECK-CVT-NEXT: orr x9, x10, x9
+; CHECK-CVT-NEXT: fmov d0, x9
+; CHECK-CVT-NEXT: fcvtxn s0, d0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_uitofp_i64:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: lsr x8, x0, #53
+; CHECK-BF16-NEXT: and x9, x0, #0xfffffffffffff000
+; CHECK-BF16-NEXT: cmp x8, #0
+; CHECK-BF16-NEXT: csel x8, x9, x0, ne
+; CHECK-BF16-NEXT: ucvtf d0, x8
+; CHECK-BF16-NEXT: cset w8, ne
+; CHECK-BF16-NEXT: tst x0, #0xfff
+; CHECK-BF16-NEXT: csel w8, wzr, w8, eq
+; CHECK-BF16-NEXT: fmov x9, d0
+; CHECK-BF16-NEXT: orr x8, x9, x8
+; CHECK-BF16-NEXT: fmov d0, x8
+; CHECK-BF16-NEXT: fcvtxn s0, d0
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %r = uitofp i64 %a to bfloat
+ ret bfloat %r
+}
+
+define bfloat @test_sitofp_i32(i32 %a) #0 {
+; CHECK-CVT-LABEL: test_sitofp_i32:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: scvtf d0, w0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: fcvtxn s0, d0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_sitofp_i32:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: scvtf d0, w0
+; CHECK-BF16-NEXT: fcvtxn s0, d0
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %r = sitofp i32 %a to bfloat
+ ret bfloat %r
+}
+
+define bfloat @test_sitofp_i64(i64 %a) #0 {
+; CHECK-CVT-LABEL: test_sitofp_i64:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: cmp x0, #0
+; CHECK-CVT-NEXT: and x11, x0, #0x8000000000000000
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: cneg x9, x0, mi
+; CHECK-CVT-NEXT: lsr x10, x9, #53
+; CHECK-CVT-NEXT: cmp x10, #0
+; CHECK-CVT-NEXT: and x10, x9, #0xfffffffffffff000
+; CHECK-CVT-NEXT: csel x10, x10, x9, ne
+; CHECK-CVT-NEXT: scvtf d0, x10
+; CHECK-CVT-NEXT: cset w10, ne
+; CHECK-CVT-NEXT: tst x9, #0xfff
+; CHECK-CVT-NEXT: csel w10, wzr, w10, eq
+; CHECK-CVT-NEXT: fmov x9, d0
+; CHECK-CVT-NEXT: orr x9, x9, x11
+; CHECK-CVT-NEXT: orr x9, x9, x10
+; CHECK-CVT-NEXT: fmov d0, x9
+; CHECK-CVT-NEXT: fcvtxn s0, d0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_sitofp_i64:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: cmp x0, #0
+; CHECK-BF16-NEXT: cneg x8, x0, mi
+; CHECK-BF16-NEXT: lsr x9, x8, #53
+; CHECK-BF16-NEXT: and x10, x8, #0xfffffffffffff000
+; CHECK-BF16-NEXT: cmp x9, #0
+; CHECK-BF16-NEXT: csel x9, x10, x8, ne
+; CHECK-BF16-NEXT: and x10, x0, #0x8000000000000000
+; CHECK-BF16-NEXT: cset w11, ne
+; CHECK-BF16-NEXT: scvtf d0, x9
+; CHECK-BF16-NEXT: tst x8, #0xfff
+; CHECK-BF16-NEXT: fmov x9, d0
+; CHECK-BF16-NEXT: orr x8, x9, x10
+; CHECK-BF16-NEXT: csel w9, wzr, w11, eq
+; CHECK-BF16-NEXT: orr x8, x8, x9
+; CHECK-BF16-NEXT: fmov d0, x8
+; CHECK-BF16-NEXT: fcvtxn s0, d0
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %r = sitofp i64 %a to bfloat
+ ret bfloat %r
+}
+
+define bfloat @test_uitofp_i32_fadd(i32 %a, bfloat %b) #0 {
+; CHECK-CVT-LABEL: test_uitofp_i32_fadd:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: ucvtf d1, w0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fcvtxn s1, d1
+; CHECK-CVT-NEXT: fmov w9, s1
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w9, w9, w8
+; CHECK-CVT-NEXT: add w9, w10, w9
+; CHECK-CVT-NEXT: lsr w9, w9, #16
+; CHECK-CVT-NEXT: fmov s1, w9
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: fmov w10, s1
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: fmov s0, w9
+; CHECK-CVT-NEXT: lsl w10, w10, #16
+; CHECK-CVT-NEXT: fmov s1, w10
+; CHECK-CVT-NEXT: fadd s0, s0, s1
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_uitofp_i32_fadd:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: ucvtf d1, w0
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fcvtxn s1, d1
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: bfcvt h1, s1
+; CHECK-BF16-NEXT: fmov w9, s1
+; CHECK-BF16-NEXT: lsl w9, w9, #16
+; CHECK-BF16-NEXT: fmov s1, w9
+; CHECK-BF16-NEXT: fadd s0, s0, s1
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %c = uitofp i32 %a to bfloat
+ %r = fadd bfloat %b, %c
+ ret bfloat %r
+}
+
+define bfloat @test_sitofp_i32_fadd(i32 %a, bfloat %b) #0 {
+; CHECK-CVT-LABEL: test_sitofp_i32_fadd:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: scvtf d1, w0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fcvtxn s1, d1
+; CHECK-CVT-NEXT: fmov w9, s1
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w9, w9, w8
+; CHECK-CVT-NEXT: add w9, w10, w9
+; CHECK-CVT-NEXT: lsr w9, w9, #16
+; CHECK-CVT-NEXT: fmov s1, w9
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: fmov w10, s1
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: fmov s0, w9
+; CHECK-CVT-NEXT: lsl w10, w10, #16
+; CHECK-CVT-NEXT: fmov s1, w10
+; CHECK-CVT-NEXT: fadd s0, s0, s1
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_sitofp_i32_fadd:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: scvtf d1, w0
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fcvtxn s1, d1
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: bfcvt h1, s1
+; CHECK-BF16-NEXT: fmov w9, s1
+; CHECK-BF16-NEXT: lsl w9, w9, #16
+; CHECK-BF16-NEXT: fmov s1, w9
+; CHECK-BF16-NEXT: fadd s0, s0, s1
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %c = sitofp i32 %a to bfloat
+ %r = fadd bfloat %b, %c
+ ret bfloat %r
+}
+
+define bfloat @test_fptrunc_float(float %a) #0 {
+; CHECK-CVT-LABEL: test_fptrunc_float:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: fcmp s0, s0
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: orr w9, w9, #0x400000
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: csel w8, w9, w8, vs
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_fptrunc_float:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %r = fptrunc float %a to bfloat
+ ret bfloat %r
+}
+
+define bfloat @test_fptrunc_double(double %a) #0 {
+; CHECK-CVT-LABEL: test_fptrunc_double:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: fcvtxn s0, d0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_fptrunc_double:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: fcvtxn s0, d0
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %r = fptrunc double %a to bfloat
+ ret bfloat %r
+}
+
+define float @test_fpext_float(bfloat %a) #0 {
+; CHECK-LABEL: test_fpext_float:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: ret
+ %r = fpext bfloat %a to float
+ ret float %r
+}
+
+define double @test_fpext_double(bfloat %a) #0 {
+; CHECK-LABEL: test_fpext_double:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fcvt d0, s0
+; CHECK-NEXT: ret
+ %r = fpext bfloat %a to double
+ ret double %r
+}
+
+define i16 @test_bitcast_bfloattoi16(bfloat %a) #0 {
+; CHECK-LABEL: test_bitcast_bfloattoi16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+ %r = bitcast bfloat %a to i16
+ ret i16 %r
+}
+
+define bfloat @test_bitcast_i16tobfloat(i16 %a) #0 {
+; CHECK-LABEL: test_bitcast_i16tobfloat:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov s0, w0
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT: ret
+ %r = bitcast i16 %a to bfloat
+ ret bfloat %r
+}
+
+declare bfloat @llvm.sqrt.f16(bfloat %a) #0
+declare bfloat @llvm.powi.f16.i32(bfloat %a, i32 %b) #0
+declare bfloat @llvm.sin.f16(bfloat %a) #0
+declare bfloat @llvm.cos.f16(bfloat %a) #0
+declare bfloat @llvm.tan.f16(bfloat %a) #0
+declare bfloat @llvm.asin.f16(bfloat %a) #0
+declare bfloat @llvm.acos.f16(bfloat %a) #0
+declare bfloat @llvm.atan.f16(bfloat %a) #0
+declare bfloat @llvm.atan2.f16(bfloat %a, bfloat %b) #0
+declare bfloat @llvm.sinh.f16(bfloat %a) #0
+declare bfloat @llvm.cosh.f16(bfloat %a) #0
+declare bfloat @llvm.tanh.f16(bfloat %a) #0
+declare bfloat @llvm.pow.f16(bfloat %a, bfloat %b) #0
+declare bfloat @llvm.exp.f16(bfloat %a) #0
+declare bfloat @llvm.exp2.f16(bfloat %a) #0
+declare bfloat @llvm.log.f16(bfloat %a) #0
+declare bfloat @llvm.log10.f16(bfloat %a) #0
+declare bfloat @llvm.log2.f16(bfloat %a) #0
+declare bfloat @llvm.fma.f16(bfloat %a, bfloat %b, bfloat %c) #0
+declare bfloat @llvm.fabs.f16(bfloat %a) #0
+declare bfloat @llvm.minnum.f16(bfloat %a, bfloat %b) #0
+declare bfloat @llvm.maxnum.f16(bfloat %a, bfloat %b) #0
+declare bfloat @llvm.copysign.f16(bfloat %a, bfloat %b) #0
+declare bfloat @llvm.floor.f16(bfloat %a) #0
+declare bfloat @llvm.ceil.f16(bfloat %a) #0
+declare bfloat @llvm.trunc.f16(bfloat %a) #0
+declare bfloat @llvm.rint.f16(bfloat %a) #0
+declare bfloat @llvm.nearbyint.f16(bfloat %a) #0
+declare bfloat @llvm.round.f16(bfloat %a) #0
+declare bfloat @llvm.roundeven.f16(bfloat %a) #0
+declare bfloat @llvm.fmuladd.f16(bfloat %a, bfloat %b, bfloat %c) #0
+
+
+define bfloat @test_sqrt(bfloat %a) #0 {
+; CHECK-CVT-LABEL: test_sqrt:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: fmov s0, w9
+; CHECK-CVT-NEXT: fsqrt s0, s0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_sqrt:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: fsqrt s0, s0
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.sqrt.f16(bfloat %a)
+ ret bfloat %r
+}
+
+define bfloat @test_powi(bfloat %a, i32 %b) #0 {
+; CHECK-CVT-LABEL: test_powi:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: bl __powisf2
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_powi:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: bl __powisf2
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.powi.f16.i32(bfloat %a, i32 %b)
+ ret bfloat %r
+}
+
+
+define bfloat @test_sin(bfloat %a) #0 {
+; CHECK-CVT-LABEL: test_sin:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: bl sinf
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_sin:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: bl sinf
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.sin.f16(bfloat %a)
+ ret bfloat %r
+}
+
+define bfloat @test_cos(bfloat %a) #0 {
+; CHECK-CVT-LABEL: test_cos:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: bl cosf
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_cos:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: bl cosf
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.cos.f16(bfloat %a)
+ ret bfloat %r
+}
+
+define bfloat @test_tan(bfloat %a) #0 {
+; CHECK-CVT-LABEL: test_tan:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: bl tanf
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_tan:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: bl tanf
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.tan.f16(bfloat %a)
+ ret bfloat %r
+}
+
+define bfloat @test_acos(bfloat %a) #0 {
+; CHECK-CVT-LABEL: test_acos:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: bl acosf
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_acos:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: bl acosf
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.acos.f16(bfloat %a)
+ ret bfloat %r
+}
+
+define bfloat @test_asin(bfloat %a) #0 {
+; CHECK-CVT-LABEL: test_asin:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: bl asinf
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_asin:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: bl asinf
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.asin.f16(bfloat %a)
+ ret bfloat %r
+}
+
+define bfloat @test_atan(bfloat %a) #0 {
+; CHECK-CVT-LABEL: test_atan:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: bl atanf
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_atan:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: bl atanf
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.atan.f16(bfloat %a)
+ ret bfloat %r
+}
+
+define bfloat @test_atan2(bfloat %a, bfloat %b) #0 {
+; CHECK-CVT-LABEL: test_atan2:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: fmov w9, s1
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: fmov s1, w9
+; CHECK-CVT-NEXT: bl atan2f
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_atan2:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: fmov w9, s1
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: lsl w9, w9, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: fmov s1, w9
+; CHECK-BF16-NEXT: bl atan2f
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.atan2.f16(bfloat %a, bfloat %b)
+ ret bfloat %r
+}
+
+define bfloat @test_cosh(bfloat %a) #0 {
+; CHECK-CVT-LABEL: test_cosh:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: bl coshf
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_cosh:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: bl coshf
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.cosh.f16(bfloat %a)
+ ret bfloat %r
+}
+
+define bfloat @test_sinh(bfloat %a) #0 {
+; CHECK-CVT-LABEL: test_sinh:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: bl sinhf
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_sinh:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: bl sinhf
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.sinh.f16(bfloat %a)
+ ret bfloat %r
+}
+
+define bfloat @test_tanh(bfloat %a) #0 {
+; CHECK-CVT-LABEL: test_tanh:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: bl tanhf
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_tanh:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: bl tanhf
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.tanh.f16(bfloat %a)
+ ret bfloat %r
+}
+
+define bfloat @test_pow(bfloat %a, bfloat %b) #0 {
+; CHECK-CVT-LABEL: test_pow:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: fmov w9, s1
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: fmov s1, w9
+; CHECK-CVT-NEXT: bl powf
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_pow:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: fmov w9, s1
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: lsl w9, w9, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: fmov s1, w9
+; CHECK-BF16-NEXT: bl powf
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.pow.f16(bfloat %a, bfloat %b)
+ ret bfloat %r
+}
+
+define bfloat @test_exp(bfloat %a) #0 {
+; CHECK-CVT-LABEL: test_exp:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: bl expf
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_exp:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: bl expf
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.exp.f16(bfloat %a)
+ ret bfloat %r
+}
+
+define bfloat @test_exp2(bfloat %a) #0 {
+; CHECK-CVT-LABEL: test_exp2:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: bl exp2f
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_exp2:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: bl exp2f
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.exp2.f16(bfloat %a)
+ ret bfloat %r
+}
+
+define bfloat @test_log(bfloat %a) #0 {
+; CHECK-CVT-LABEL: test_log:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: bl logf
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_log:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: bl logf
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.log.f16(bfloat %a)
+ ret bfloat %r
+}
+
+define bfloat @test_log10(bfloat %a) #0 {
+; CHECK-CVT-LABEL: test_log10:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: bl log10f
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_log10:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: bl log10f
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.log10.f16(bfloat %a)
+ ret bfloat %r
+}
+
+define bfloat @test_log2(bfloat %a) #0 {
+; CHECK-CVT-LABEL: test_log2:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: bl log2f
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_log2:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: bl log2f
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.log2.f16(bfloat %a)
+ ret bfloat %r
+}
+
+define bfloat @test_fma(bfloat %a, bfloat %b, bfloat %c) #0 {
+; CHECK-CVT-LABEL: test_fma:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: // kill: def $h2 killed $h2 def $s2
+; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w8, s2
+; CHECK-CVT-NEXT: fmov w9, s1
+; CHECK-CVT-NEXT: fmov w10, s0
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: lsl w10, w10, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: fmov s1, w9
+; CHECK-CVT-NEXT: fmov s2, w10
+; CHECK-CVT-NEXT: mov w10, #32767 // =0x7fff
+; CHECK-CVT-NEXT: fmadd s0, s2, s1, s0
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: ubfx w9, w8, #16, #1
+; CHECK-CVT-NEXT: add w8, w8, w10
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_fma:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: // kill: def $h2 killed $h2 def $s2
+; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s2
+; CHECK-BF16-NEXT: fmov w9, s1
+; CHECK-BF16-NEXT: fmov w10, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: lsl w9, w9, #16
+; CHECK-BF16-NEXT: lsl w10, w10, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: fmov s1, w9
+; CHECK-BF16-NEXT: fmov s2, w10
+; CHECK-BF16-NEXT: fmadd s0, s2, s1, s0
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.fma.f16(bfloat %a, bfloat %b, bfloat %c)
+ ret bfloat %r
+}
+
+define bfloat @test_fabs(bfloat %a) #0 {
+; CHECK-LABEL: test_fabs:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: and w8, w8, #0x7fff
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-NEXT: ret
+ %r = call bfloat @llvm.fabs.f16(bfloat %a)
+ ret bfloat %r
+}
+
+define bfloat @test_minnum(bfloat %a, bfloat %b) #0 {
+; CHECK-CVT-LABEL: test_minnum:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w9, s1
+; CHECK-CVT-NEXT: fmov w10, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: lsl w10, w10, #16
+; CHECK-CVT-NEXT: fmov s0, w9
+; CHECK-CVT-NEXT: fmov s1, w10
+; CHECK-CVT-NEXT: fminnm s0, s1, s0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_minnum:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s1
+; CHECK-BF16-NEXT: fmov w9, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: lsl w9, w9, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: fmov s1, w9
+; CHECK-BF16-NEXT: fminnm s0, s1, s0
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.minnum.f16(bfloat %a, bfloat %b)
+ ret bfloat %r
+}
+
+define bfloat @test_maxnum(bfloat %a, bfloat %b) #0 {
+; CHECK-CVT-LABEL: test_maxnum:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w9, s1
+; CHECK-CVT-NEXT: fmov w10, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: lsl w10, w10, #16
+; CHECK-CVT-NEXT: fmov s0, w9
+; CHECK-CVT-NEXT: fmov s1, w10
+; CHECK-CVT-NEXT: fmaxnm s0, s1, s0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_maxnum:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s1
+; CHECK-BF16-NEXT: fmov w9, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: lsl w9, w9, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: fmov s1, w9
+; CHECK-BF16-NEXT: fmaxnm s0, s1, s0
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.maxnum.f16(bfloat %a, bfloat %b)
+ ret bfloat %r
+}
+
+define bfloat @test_copysign(bfloat %a, bfloat %b) #0 {
+; CHECK-CVT-LABEL: test_copysign:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w8, s1
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mvni v2.4s, #128, lsl #24
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: fmov s1, w9
+; CHECK-CVT-NEXT: bit v0.16b, v1.16b, v2.16b
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_copysign:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s1
+; CHECK-BF16-NEXT: fmov w9, s0
+; CHECK-BF16-NEXT: mvni v2.4s, #128, lsl #24
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: lsl w9, w9, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: fmov s1, w9
+; CHECK-BF16-NEXT: bit v0.16b, v1.16b, v2.16b
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.copysign.f16(bfloat %a, bfloat %b)
+ ret bfloat %r
+}
+
+define bfloat @test_copysign_f32(bfloat %a, float %b) #0 {
+; CHECK-CVT-LABEL: test_copysign_f32:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: mvni v2.4s, #128, lsl #24
+; CHECK-CVT-NEXT: // kill: def $s1 killed $s1 def $q1
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: bif v0.16b, v1.16b, v2.16b
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_copysign_f32:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: mvni v2.4s, #128, lsl #24
+; CHECK-BF16-NEXT: // kill: def $s1 killed $s1 def $q1
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: bif v0.16b, v1.16b, v2.16b
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %tb = fptrunc float %b to bfloat
+ %r = call bfloat @llvm.copysign.f16(bfloat %a, bfloat %tb)
+ ret bfloat %r
+}
+
+define bfloat @test_copysign_f64(bfloat %a, double %b) #0 {
+; CHECK-CVT-LABEL: test_copysign_f64:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: fcvt s1, d1
+; CHECK-CVT-NEXT: mvni v2.4s, #128, lsl #24
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: bif v0.16b, v1.16b, v2.16b
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_copysign_f64:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: fcvt s1, d1
+; CHECK-BF16-NEXT: mvni v2.4s, #128, lsl #24
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: bif v0.16b, v1.16b, v2.16b
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %tb = fptrunc double %b to bfloat
+ %r = call bfloat @llvm.copysign.f16(bfloat %a, bfloat %tb)
+ ret bfloat %r
+}
+
+; Check that the FP promotion will use a truncating FP_ROUND, so we can fold
+; away the (fpext (fp_round <result>)) here.
+
+define float @test_copysign_extended(bfloat %a, bfloat %b) #0 {
+; CHECK-CVT-LABEL: test_copysign_extended:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w8, s1
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mvni v2.4s, #128, lsl #24
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: fmov s1, w9
+; CHECK-CVT-NEXT: bit v0.16b, v1.16b, v2.16b
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_copysign_extended:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s1
+; CHECK-BF16-NEXT: fmov w9, s0
+; CHECK-BF16-NEXT: mvni v2.4s, #128, lsl #24
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: lsl w9, w9, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: fmov s1, w9
+; CHECK-BF16-NEXT: bit v0.16b, v1.16b, v2.16b
+; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.copysign.f16(bfloat %a, bfloat %b)
+ %xr = fpext bfloat %r to float
+ ret float %xr
+}
+
+define bfloat @test_floor(bfloat %a) #0 {
+; CHECK-CVT-LABEL: test_floor:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: fmov s0, w9
+; CHECK-CVT-NEXT: frintm s0, s0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_floor:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: frintm s0, s0
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.floor.f16(bfloat %a)
+ ret bfloat %r
+}
+
+define bfloat @test_ceil(bfloat %a) #0 {
+; CHECK-CVT-LABEL: test_ceil:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: fmov s0, w9
+; CHECK-CVT-NEXT: frintp s0, s0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_ceil:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: frintp s0, s0
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.ceil.f16(bfloat %a)
+ ret bfloat %r
+}
+
+define bfloat @test_trunc(bfloat %a) #0 {
+; CHECK-CVT-LABEL: test_trunc:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: fmov s0, w9
+; CHECK-CVT-NEXT: frintz s0, s0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_trunc:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: frintz s0, s0
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.trunc.f16(bfloat %a)
+ ret bfloat %r
+}
+
+define bfloat @test_rint(bfloat %a) #0 {
+; CHECK-CVT-LABEL: test_rint:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: fmov s0, w9
+; CHECK-CVT-NEXT: frintx s0, s0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_rint:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: frintx s0, s0
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.rint.f16(bfloat %a)
+ ret bfloat %r
+}
+
+define bfloat @test_nearbyint(bfloat %a) #0 {
+; CHECK-CVT-LABEL: test_nearbyint:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: fmov s0, w9
+; CHECK-CVT-NEXT: frinti s0, s0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_nearbyint:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: frinti s0, s0
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.nearbyint.f16(bfloat %a)
+ ret bfloat %r
+}
+
+define bfloat @test_round(bfloat %a) #0 {
+; CHECK-CVT-LABEL: test_round:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: fmov s0, w9
+; CHECK-CVT-NEXT: frinta s0, s0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_round:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: frinta s0, s0
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.round.f16(bfloat %a)
+ ret bfloat %r
+}
+
+define bfloat @test_roundeven(bfloat %a) #0 {
+; CHECK-CVT-LABEL: test_roundeven:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: fmov s0, w9
+; CHECK-CVT-NEXT: frintn s0, s0
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: add w8, w10, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_roundeven:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: frintn s0, s0
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.roundeven.f16(bfloat %a)
+ ret bfloat %r
+}
+
+define bfloat @test_fmuladd(bfloat %a, bfloat %b, bfloat %c) #0 {
+; CHECK-CVT-LABEL: test_fmuladd:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-NEXT: fmov w8, s1
+; CHECK-CVT-NEXT: fmov w9, s0
+; CHECK-CVT-NEXT: mov w10, #32767 // =0x7fff
+; CHECK-CVT-NEXT: // kill: def $h2 killed $h2 def $s2
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: fmov s1, w9
+; CHECK-CVT-NEXT: fmul s0, s1, s0
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: ubfx w9, w8, #16, #1
+; CHECK-CVT-NEXT: add w8, w8, w10
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: fmov w9, s2
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: lsl w8, w8, #16
+; CHECK-CVT-NEXT: lsl w9, w9, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: fmov s1, w9
+; CHECK-CVT-NEXT: fadd s0, s0, s1
+; CHECK-CVT-NEXT: fmov w8, s0
+; CHECK-CVT-NEXT: ubfx w9, w8, #16, #1
+; CHECK-CVT-NEXT: add w8, w8, w10
+; CHECK-CVT-NEXT: add w8, w9, w8
+; CHECK-CVT-NEXT: lsr w8, w8, #16
+; CHECK-CVT-NEXT: fmov s0, w8
+; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: test_fmuladd:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1
+; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-BF16-NEXT: fmov w8, s1
+; CHECK-BF16-NEXT: fmov w9, s0
+; CHECK-BF16-NEXT: // kill: def $h2 killed $h2 def $s2
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: lsl w9, w9, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: fmov s1, w9
+; CHECK-BF16-NEXT: fmov w9, s2
+; CHECK-BF16-NEXT: fmul s0, s1, s0
+; CHECK-BF16-NEXT: lsl w9, w9, #16
+; CHECK-BF16-NEXT: fmov s1, w9
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: fmov w8, s0
+; CHECK-BF16-NEXT: lsl w8, w8, #16
+; CHECK-BF16-NEXT: fmov s0, w8
+; CHECK-BF16-NEXT: fadd s0, s0, s1
+; CHECK-BF16-NEXT: bfcvt h0, s0
+; CHECK-BF16-NEXT: ret
+ %r = call bfloat @llvm.fmuladd.f16(bfloat %a, bfloat %b, bfloat %c)
+ ret bfloat %r
+}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll
new file mode 100644
index 00000000000000..9b6e19eba3f4e6
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll
@@ -0,0 +1,711 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64 -mattr=-bf16 | FileCheck %s --check-prefixes=CHECK,CHECK-CVT
+; RUN: llc < %s -mtriple=aarch64 -mattr=+bf16 | FileCheck %s --check-prefixes=CHECK,CHECK-BF16
+
+define <4 x bfloat> @add_h(<4 x bfloat> %a, <4 x bfloat> %b) {
+; CHECK-CVT-LABEL: add_h:
+; CHECK-CVT: // %bb.0: // %entry
+; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT: fadd v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: ushr v2.4s, v0.4s, #16
+; CHECK-CVT-NEXT: and v1.16b, v2.16b, v1.16b
+; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8
+; CHECK-CVT-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v2.4s
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: add_h:
+; CHECK-BF16: // %bb.0: // %entry
+; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT: fadd v0.4s, v0.4s, v1.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-BF16-NEXT: ret
+entry:
+
+ %0 = fadd <4 x bfloat> %a, %b
+ ret <4 x bfloat> %0
+}
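+
+; Note on the vector CHECK-CVT lowering above (reviewer annotation): this is
+; the vectorised form of the same round-to-nearest-even truncation. ushr #16
+; plus the and with movi #1 extracts the per-lane rounding LSB, movi #127,
+; msl #8 materialises 0x7FFF, and addhn adds it and narrows each 32-bit lane
+; to its top 16 bits. With +bf16 this collapses to a single bfcvtn.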
+
+
+define <4 x bfloat> @build_h4(<4 x bfloat> %a) {
+; CHECK-LABEL: build_h4:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w8, #15565 // =0x3ccd
+; CHECK-NEXT: dup v0.4h, w8
+; CHECK-NEXT: ret
+entry:
+ ret <4 x bfloat> <bfloat 0xR3CCD, bfloat 0xR3CCD, bfloat 0xR3CCD, bfloat 0xR3CCD>
+}
+
+
+define <4 x bfloat> @sub_h(<4 x bfloat> %a, <4 x bfloat> %b) {
+; CHECK-CVT-LABEL: sub_h:
+; CHECK-CVT: // %bb.0: // %entry
+; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT: fsub v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: ushr v2.4s, v0.4s, #16
+; CHECK-CVT-NEXT: and v1.16b, v2.16b, v1.16b
+; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8
+; CHECK-CVT-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v2.4s
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: sub_h:
+; CHECK-BF16: // %bb.0: // %entry
+; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT: fsub v0.4s, v0.4s, v1.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-BF16-NEXT: ret
+entry:
+
+ %0 = fsub <4 x bfloat> %a, %b
+ ret <4 x bfloat> %0
+}
+
+
+define <4 x bfloat> @mul_h(<4 x bfloat> %a, <4 x bfloat> %b) {
+; CHECK-CVT-LABEL: mul_h:
+; CHECK-CVT: // %bb.0: // %entry
+; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT: fmul v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: ushr v2.4s, v0.4s, #16
+; CHECK-CVT-NEXT: and v1.16b, v2.16b, v1.16b
+; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8
+; CHECK-CVT-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v2.4s
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: mul_h:
+; CHECK-BF16: // %bb.0: // %entry
+; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT: fmul v0.4s, v0.4s, v1.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-BF16-NEXT: ret
+entry:
+
+ %0 = fmul <4 x bfloat> %a, %b
+ ret <4 x bfloat> %0
+}
+
+
+define <4 x bfloat> @div_h(<4 x bfloat> %a, <4 x bfloat> %b) {
+; CHECK-CVT-LABEL: div_h:
+; CHECK-CVT: // %bb.0: // %entry
+; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT: fdiv v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: ushr v2.4s, v0.4s, #16
+; CHECK-CVT-NEXT: and v1.16b, v2.16b, v1.16b
+; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8
+; CHECK-CVT-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v2.4s
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: div_h:
+; CHECK-BF16: // %bb.0: // %entry
+; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-BF16-NEXT: fdiv v0.4s, v0.4s, v1.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-BF16-NEXT: ret
+entry:
+
+ %0 = fdiv <4 x bfloat> %a, %b
+ ret <4 x bfloat> %0
+}
+
+
+define <4 x bfloat> @load_h(ptr %a) {
+; CHECK-LABEL: load_h:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: ret
+entry:
+ %0 = load <4 x bfloat>, ptr %a, align 4
+ ret <4 x bfloat> %0
+}
+
+
+define void @store_h(ptr %a, <4 x bfloat> %b) {
+; CHECK-LABEL: store_h:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+entry:
+ store <4 x bfloat> %b, ptr %a, align 4
+ ret void
+}
+
+define <4 x bfloat> @s_to_h(<4 x float> %a) {
+; CHECK-CVT-LABEL: s_to_h:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8
+; CHECK-CVT-NEXT: ushr v3.4s, v0.4s, #16
+; CHECK-CVT-NEXT: and v1.16b, v3.16b, v1.16b
+; CHECK-CVT-NEXT: add v2.4s, v0.4s, v2.4s
+; CHECK-CVT-NEXT: fcmeq v3.4s, v0.4s, v0.4s
+; CHECK-CVT-NEXT: orr v0.4s, #64, lsl #16
+; CHECK-CVT-NEXT: add v1.4s, v1.4s, v2.4s
+; CHECK-CVT-NEXT: bit v0.16b, v1.16b, v3.16b
+; CHECK-CVT-NEXT: shrn v0.4h, v0.4s, #16
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: s_to_h:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-BF16-NEXT: ret
+ %1 = fptrunc <4 x float> %a to <4 x bfloat>
+ ret <4 x bfloat> %1
+}
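+
+; Note on the CHECK-CVT lowering above (reviewer annotation): here the
+; expansion additionally handles NaN inputs. fcmeq marks the ordered lanes,
+; orr #64, lsl #16 sets bit 22 of the original value (which becomes the
+; quiet-NaN mantissa bit after the final shrn #16), and bit picks the rounded
+; result for ordered lanes and the quieted input for NaN lanes.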
+
+define <4 x bfloat> @d_to_h(<4 x double> %a) {
+; CHECK-CVT-LABEL: d_to_h:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: fcvtxn v0.2s, v0.2d
+; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8
+; CHECK-CVT-NEXT: fcvtxn2 v0.4s, v1.2d
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: ushr v3.4s, v0.4s, #16
+; CHECK-CVT-NEXT: add v2.4s, v0.4s, v2.4s
+; CHECK-CVT-NEXT: and v1.16b, v3.16b, v1.16b
+; CHECK-CVT-NEXT: fcmeq v3.4s, v0.4s, v0.4s
+; CHECK-CVT-NEXT: orr v0.4s, #64, lsl #16
+; CHECK-CVT-NEXT: add v1.4s, v1.4s, v2.4s
+; CHECK-CVT-NEXT: bit v0.16b, v1.16b, v3.16b
+; CHECK-CVT-NEXT: shrn v0.4h, v0.4s, #16
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: d_to_h:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: fcvtxn v0.2s, v0.2d
+; CHECK-BF16-NEXT: fcvtxn2 v0.4s, v1.2d
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-BF16-NEXT: ret
+ %1 = fptrunc <4 x double> %a to <4 x bfloat>
+ ret <4 x bfloat> %1
+}
+
+define <4 x float> @h_to_s(<4 x bfloat> %a) {
+; CHECK-LABEL: h_to_s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: ret
+ %1 = fpext <4 x bfloat> %a to <4 x float>
+ ret <4 x float> %1
+}
+
+define <4 x double> @h_to_d(<4 x bfloat> %a) {
+; CHECK-LABEL: h_to_d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: fcvtl2 v1.2d, v0.4s
+; CHECK-NEXT: fcvtl v0.2d, v0.2s
+; CHECK-NEXT: ret
+ %1 = fpext <4 x bfloat> %a to <4 x double>
+ ret <4 x double> %1
+}
+
+define <4 x bfloat> @bitcast_i_to_h(float, <4 x i16> %a) {
+; CHECK-LABEL: bitcast_i_to_h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov d0, d1
+; CHECK-NEXT: ret
+ %2 = bitcast <4 x i16> %a to <4 x bfloat>
+ ret <4 x bfloat> %2
+}
+
+define <4 x i16> @bitcast_h_to_i(float, <4 x bfloat> %a) {
+; CHECK-LABEL: bitcast_h_to_i:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmov d0, d1
+; CHECK-NEXT: ret
+ %2 = bitcast <4 x bfloat> %a to <4 x i16>
+ ret <4 x i16> %2
+}
+
+define <4 x bfloat> @sitofp_i8(<4 x i8> %a) #0 {
+; CHECK-CVT-LABEL: sitofp_i8:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: shl v0.4h, v0.4h, #8
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: sshr v0.4h, v0.4h, #8
+; CHECK-CVT-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-CVT-NEXT: scvtf v0.4s, v0.4s
+; CHECK-CVT-NEXT: ushr v2.4s, v0.4s, #16
+; CHECK-CVT-NEXT: and v1.16b, v2.16b, v1.16b
+; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8
+; CHECK-CVT-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v2.4s
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: sitofp_i8:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: shl v0.4h, v0.4h, #8
+; CHECK-BF16-NEXT: sshr v0.4h, v0.4h, #8
+; CHECK-BF16-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-BF16-NEXT: scvtf v0.4s, v0.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-BF16-NEXT: ret
+ %1 = sitofp <4 x i8> %a to <4 x bfloat>
+ ret <4 x bfloat> %1
+}
+
+define <4 x bfloat> @sitofp_i16(<4 x i16> %a) #0 {
+; CHECK-CVT-LABEL: sitofp_i16:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: scvtf v0.4s, v0.4s
+; CHECK-CVT-NEXT: ushr v2.4s, v0.4s, #16
+; CHECK-CVT-NEXT: and v1.16b, v2.16b, v1.16b
+; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8
+; CHECK-CVT-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v2.4s
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: sitofp_i16:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-BF16-NEXT: scvtf v0.4s, v0.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-BF16-NEXT: ret
+ %1 = sitofp <4 x i16> %a to <4 x bfloat>
+ ret <4 x bfloat> %1
+}
+
+
+define <4 x bfloat> @sitofp_i32(<4 x i32> %a) #0 {
+; CHECK-CVT-LABEL: sitofp_i32:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: scvtf v0.4s, v0.4s
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: ushr v2.4s, v0.4s, #16
+; CHECK-CVT-NEXT: and v1.16b, v2.16b, v1.16b
+; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8
+; CHECK-CVT-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v2.4s
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: sitofp_i32:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: scvtf v0.4s, v0.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-BF16-NEXT: ret
+ %1 = sitofp <4 x i32> %a to <4 x bfloat>
+ ret <4 x bfloat> %1
+}
+
+
+define <4 x bfloat> @sitofp_i64(<4 x i64> %a) #0 {
+; CHECK-CVT-LABEL: sitofp_i64:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: scvtf v0.2d, v0.2d
+; CHECK-CVT-NEXT: scvtf v1.2d, v1.2d
+; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8
+; CHECK-CVT-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-CVT-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: ushr v3.4s, v0.4s, #16
+; CHECK-CVT-NEXT: add v2.4s, v0.4s, v2.4s
+; CHECK-CVT-NEXT: and v1.16b, v3.16b, v1.16b
+; CHECK-CVT-NEXT: fcmeq v3.4s, v0.4s, v0.4s
+; CHECK-CVT-NEXT: orr v0.4s, #64, lsl #16
+; CHECK-CVT-NEXT: add v1.4s, v1.4s, v2.4s
+; CHECK-CVT-NEXT: bit v0.16b, v1.16b, v3.16b
+; CHECK-CVT-NEXT: shrn v0.4h, v0.4s, #16
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: sitofp_i64:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: scvtf v0.2d, v0.2d
+; CHECK-BF16-NEXT: scvtf v1.2d, v1.2d
+; CHECK-BF16-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-BF16-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-BF16-NEXT: ret
+ %1 = sitofp <4 x i64> %a to <4 x bfloat>
+ ret <4 x bfloat> %1
+}
+
+define <4 x bfloat> @uitofp_i8(<4 x i8> %a) #0 {
+; CHECK-CVT-LABEL: uitofp_i8:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: bic v0.4h, #255, lsl #8
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-CVT-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-CVT-NEXT: ushr v2.4s, v0.4s, #16
+; CHECK-CVT-NEXT: and v1.16b, v2.16b, v1.16b
+; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8
+; CHECK-CVT-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v2.4s
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: uitofp_i8:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: bic v0.4h, #255, lsl #8
+; CHECK-BF16-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BF16-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-BF16-NEXT: ret
+ %1 = uitofp <4 x i8> %a to <4 x bfloat>
+ ret <4 x bfloat> %1
+}
+
+
+define <4 x bfloat> @uitofp_i16(<4 x i16> %a) #0 {
+; CHECK-CVT-LABEL: uitofp_i16:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-CVT-NEXT: ushr v2.4s, v0.4s, #16
+; CHECK-CVT-NEXT: and v1.16b, v2.16b, v1.16b
+; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8
+; CHECK-CVT-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v2.4s
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: uitofp_i16:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BF16-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-BF16-NEXT: ret
+ %1 = uitofp <4 x i16> %a to <4 x bfloat>
+ ret <4 x bfloat> %1
+}
+
+
+define <4 x bfloat> @uitofp_i32(<4 x i32> %a) #0 {
+; CHECK-CVT-LABEL: uitofp_i32:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: ushr v2.4s, v0.4s, #16
+; CHECK-CVT-NEXT: and v1.16b, v2.16b, v1.16b
+; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8
+; CHECK-CVT-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v2.4s
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: uitofp_i32:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-BF16-NEXT: ret
+ %1 = uitofp <4 x i32> %a to <4 x bfloat>
+ ret <4 x bfloat> %1
+}
+
+
+define <4 x bfloat> @uitofp_i64(<4 x i64> %a) #0 {
+; CHECK-CVT-LABEL: uitofp_i64:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: ucvtf v0.2d, v0.2d
+; CHECK-CVT-NEXT: ucvtf v1.2d, v1.2d
+; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8
+; CHECK-CVT-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-CVT-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: ushr v3.4s, v0.4s, #16
+; CHECK-CVT-NEXT: add v2.4s, v0.4s, v2.4s
+; CHECK-CVT-NEXT: and v1.16b, v3.16b, v1.16b
+; CHECK-CVT-NEXT: fcmeq v3.4s, v0.4s, v0.4s
+; CHECK-CVT-NEXT: orr v0.4s, #64, lsl #16
+; CHECK-CVT-NEXT: add v1.4s, v1.4s, v2.4s
+; CHECK-CVT-NEXT: bit v0.16b, v1.16b, v3.16b
+; CHECK-CVT-NEXT: shrn v0.4h, v0.4s, #16
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: uitofp_i64:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: ucvtf v0.2d, v0.2d
+; CHECK-BF16-NEXT: ucvtf v1.2d, v1.2d
+; CHECK-BF16-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-BF16-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-BF16-NEXT: ret
+ %1 = uitofp <4 x i64> %a to <4 x bfloat>
+ ret <4 x bfloat> %1
+}
+
+define void @test_insert_at_zero(bfloat %a, ptr %b) #0 {
+; CHECK-LABEL: test_insert_at_zero:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0
+; CHECK-NEXT: str d0, [x0]
+; CHECK-NEXT: ret
+ %1 = insertelement <4 x bfloat> undef, bfloat %a, i64 0
+ store <4 x bfloat> %1, ptr %b, align 4
+ ret void
+}
+
+define <4 x i8> @fptosi_i8(<4 x bfloat> %a) #0 {
+; CHECK-LABEL: fptosi_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: fcvtzs v0.4s, v0.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: ret
+ %1 = fptosi<4 x bfloat> %a to <4 x i8>
+ ret <4 x i8> %1
+}
+
+define <4 x i16> @fptosi_i16(<4 x bfloat> %a) #0 {
+; CHECK-LABEL: fptosi_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: fcvtzs v0.4s, v0.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: ret
+ %1 = fptosi<4 x bfloat> %a to <4 x i16>
+ ret <4 x i16> %1
+}
+
+define <4 x i8> @fptoui_i8(<4 x bfloat> %a) #0 {
+; CHECK-LABEL: fptoui_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: fcvtzs v0.4s, v0.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: ret
+; NOTE: fcvtzs selected here because the xtn shaves the sign bit
+ %1 = fptoui<4 x bfloat> %a to <4 x i8>
+ ret <4 x i8> %1
+}
+
+define <4 x i16> @fptoui_i16(<4 x bfloat> %a) #0 {
+; CHECK-LABEL: fptoui_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: fcvtzu v0.4s, v0.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: ret
+ %1 = fptoui<4 x bfloat> %a to <4 x i16>
+ ret <4 x i16> %1
+}
+
+define <4 x i1> @test_fcmp_une(<4 x bfloat> %a, <4 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_une:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: fcmeq v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: mvn v0.16b, v0.16b
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: ret
+
+ %1 = fcmp une <4 x bfloat> %a, %b
+ ret <4 x i1> %1
+}
+
+define <4 x i1> @test_fcmp_ueq(<4 x bfloat> %a, <4 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_ueq:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: fcmgt v2.4s, v0.4s, v1.4s
+; CHECK-NEXT: fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: mvn v0.8b, v0.8b
+; CHECK-NEXT: ret
+
+ %1 = fcmp ueq <4 x bfloat> %a, %b
+ ret <4 x i1> %1
+}
+
+define <4 x i1> @test_fcmp_ugt(<4 x bfloat> %a, <4 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_ugt:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-NEXT: fcmge v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: mvn v0.8b, v0.8b
+; CHECK-NEXT: ret
+
+ %1 = fcmp ugt <4 x bfloat> %a, %b
+ ret <4 x i1> %1
+}
+
+define <4 x i1> @test_fcmp_uge(<4 x bfloat> %a, <4 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_uge:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-NEXT: fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: mvn v0.8b, v0.8b
+; CHECK-NEXT: ret
+
+ %1 = fcmp uge <4 x bfloat> %a, %b
+ ret <4 x i1> %1
+}
+
+define <4 x i1> @test_fcmp_ult(<4 x bfloat> %a, <4 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_ult:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: fcmge v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: mvn v0.8b, v0.8b
+; CHECK-NEXT: ret
+
+ %1 = fcmp ult <4 x bfloat> %a, %b
+ ret <4 x i1> %1
+}
+
+define <4 x i1> @test_fcmp_ule(<4 x bfloat> %a, <4 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_ule:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: fcmgt v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: mvn v0.8b, v0.8b
+; CHECK-NEXT: ret
+
+ %1 = fcmp ule <4 x bfloat> %a, %b
+ ret <4 x i1> %1
+}
+
+define <4 x i1> @test_fcmp_uno(<4 x bfloat> %a, <4 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_uno:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: fcmge v2.4s, v0.4s, v1.4s
+; CHECK-NEXT: fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: mvn v0.8b, v0.8b
+; CHECK-NEXT: ret
+
+ %1 = fcmp uno <4 x bfloat> %a, %b
+ ret <4 x i1> %1
+}
+
+define <4 x i1> @test_fcmp_one(<4 x bfloat> %a, <4 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_one:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: fcmgt v2.4s, v0.4s, v1.4s
+; CHECK-NEXT: fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: ret
+
+ %1 = fcmp one <4 x bfloat> %a, %b
+ ret <4 x i1> %1
+}
+
+define <4 x i1> @test_fcmp_oeq(<4 x bfloat> %a, <4 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_oeq:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: fcmeq v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: ret
+
+ %1 = fcmp oeq <4 x bfloat> %a, %b
+ ret <4 x i1> %1
+}
+
+define <4 x i1> @test_fcmp_ogt(<4 x bfloat> %a, <4 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_ogt:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: fcmgt v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: ret
+
+ %1 = fcmp ogt <4 x bfloat> %a, %b
+ ret <4 x i1> %1
+}
+
+define <4 x i1> @test_fcmp_oge(<4 x bfloat> %a, <4 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_oge:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: fcmge v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: ret
+
+ %1 = fcmp oge <4 x bfloat> %a, %b
+ ret <4 x i1> %1
+}
+
+define <4 x i1> @test_fcmp_olt(<4 x bfloat> %a, <4 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_olt:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-NEXT: fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: ret
+
+ %1 = fcmp olt <4 x bfloat> %a, %b
+ ret <4 x i1> %1
+}
+
+define <4 x i1> @test_fcmp_ole(<4 x bfloat> %a, <4 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_ole:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-NEXT: fcmge v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: ret
+
+ %1 = fcmp ole <4 x bfloat> %a, %b
+ ret <4 x i1> %1
+}
+
+define <4 x i1> @test_fcmp_ord(<4 x bfloat> %a, <4 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_ord:
+; CHECK: // %bb.0:
+; CHECK-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: fcmge v2.4s, v0.4s, v1.4s
+; CHECK-NEXT: fcmgt v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: xtn v0.4h, v0.4s
+; CHECK-NEXT: ret
+
+ %1 = fcmp ord <4 x bfloat> %a, %b
+ ret <4 x i1> %1
+}
+
+attributes #0 = { nounwind }
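
For reference, the long CHECK-CVT sequences in these bf16 tests (movi #1, ushr #16, and, movi #127 msl #8, the adds, and the orr #64, lsl #16 path selected via fcmeq) spell out a round-to-nearest-even truncation from f32 to bf16 with NaN quieting, since without +bf16 there is no bfcvtn instruction to do the conversion in hardware. A scalar sketch of that lowering, not part of the patch (the helper name fp32_to_bf16 is purely illustrative), might look like this in C:

#include <stdint.h>
#include <string.h>

/* Truncate f32 -> bf16 with round-to-nearest-even, quieting NaNs; this models
   one lane of the vector sequence the CHECK-CVT lines verify. */
static inline uint16_t fp32_to_bf16(float f) {
  uint32_t bits;
  memcpy(&bits, &f, sizeof bits);                 /* reinterpret the float bits */
  if (f != f)                                     /* NaN: "orr v, #64, lsl #16" */
    return (uint16_t)((bits | 0x00400000u) >> 16); /*   forces a quiet NaN       */
  uint32_t lsb = (bits >> 16) & 1u;               /* "ushr #16" + "and" with #1  */
  bits += 0x7FFFu + lsb;                          /* "movi #127, msl #8" + adds  */
  return (uint16_t)(bits >> 16);                  /* high half ("addhn"/"uzp2")  */
}

On the CHECK-BF16 run line the same conversion collapses into bfcvtn/bfcvtn2, which is why the +bf16 bodies below are so much shorter.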
diff --git a/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll
new file mode 100644
index 00000000000000..c03e2e5321321a
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll
@@ -0,0 +1,2192 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64 -mattr=-bf16 | FileCheck %s --check-prefixes=CHECK,CHECK-CVT
+; RUN: llc < %s -mtriple=aarch64 -mattr=+bf16 | FileCheck %s --check-prefixes=CHECK,CHECK-BF16
+
+define <8 x bfloat> @add_h(<8 x bfloat> %a, <8 x bfloat> %b) {
+; CHECK-CVT-LABEL: add_h:
+; CHECK-CVT: // %bb.0: // %entry
+; CHECK-CVT-NEXT: shll2 v3.4s, v1.8h, #16
+; CHECK-CVT-NEXT: shll2 v4.4s, v0.8h, #16
+; CHECK-CVT-NEXT: movi v2.4s, #1
+; CHECK-CVT-NEXT: ushr v5.4s, v0.4s, #16
+; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT: fadd v3.4s, v4.4s, v3.4s
+; CHECK-CVT-NEXT: fadd v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT: and v2.16b, v5.16b, v2.16b
+; CHECK-CVT-NEXT: movi v1.4s, #127, msl #8
+; CHECK-CVT-NEXT: fcmeq v5.4s, v3.4s, v3.4s
+; CHECK-CVT-NEXT: add v4.4s, v3.4s, v2.4s
+; CHECK-CVT-NEXT: orr v3.4s, #64, lsl #16
+; CHECK-CVT-NEXT: add v2.4s, v0.4s, v2.4s
+; CHECK-CVT-NEXT: fcmeq v6.4s, v0.4s, v0.4s
+; CHECK-CVT-NEXT: orr v0.4s, #64, lsl #16
+; CHECK-CVT-NEXT: add v4.4s, v4.4s, v1.4s
+; CHECK-CVT-NEXT: add v1.4s, v2.4s, v1.4s
+; CHECK-CVT-NEXT: mov v2.16b, v5.16b
+; CHECK-CVT-NEXT: bsl v2.16b, v4.16b, v3.16b
+; CHECK-CVT-NEXT: bit v0.16b, v1.16b, v6.16b
+; CHECK-CVT-NEXT: uzp2 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: add_h:
+; CHECK-BF16: // %bb.0: // %entry
+; CHECK-BF16-NEXT: shll v2.4s, v1.4h, #16
+; CHECK-BF16-NEXT: shll v3.4s, v0.4h, #16
+; CHECK-BF16-NEXT: shll2 v1.4s, v1.8h, #16
+; CHECK-BF16-NEXT: shll2 v0.4s, v0.8h, #16
+; CHECK-BF16-NEXT: fadd v2.4s, v3.4s, v2.4s
+; CHECK-BF16-NEXT: fadd v1.4s, v0.4s, v1.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v2.4s
+; CHECK-BF16-NEXT: bfcvtn2 v0.8h, v1.4s
+; CHECK-BF16-NEXT: ret
+entry:
+ %0 = fadd <8 x bfloat> %a, %b
+ ret <8 x bfloat> %0
+}
+
+
+define <8 x bfloat> @sub_h(<8 x bfloat> %a, <8 x bfloat> %b) {
+; CHECK-CVT-LABEL: sub_h:
+; CHECK-CVT: // %bb.0: // %entry
+; CHECK-CVT-NEXT: shll2 v3.4s, v1.8h, #16
+; CHECK-CVT-NEXT: shll2 v4.4s, v0.8h, #16
+; CHECK-CVT-NEXT: movi v2.4s, #1
+; CHECK-CVT-NEXT: ushr v5.4s, v0.4s, #16
+; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT: fsub v3.4s, v4.4s, v3.4s
+; CHECK-CVT-NEXT: fsub v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT: and v2.16b, v5.16b, v2.16b
+; CHECK-CVT-NEXT: movi v1.4s, #127, msl #8
+; CHECK-CVT-NEXT: fcmeq v5.4s, v3.4s, v3.4s
+; CHECK-CVT-NEXT: add v4.4s, v3.4s, v2.4s
+; CHECK-CVT-NEXT: orr v3.4s, #64, lsl #16
+; CHECK-CVT-NEXT: add v2.4s, v0.4s, v2.4s
+; CHECK-CVT-NEXT: fcmeq v6.4s, v0.4s, v0.4s
+; CHECK-CVT-NEXT: orr v0.4s, #64, lsl #16
+; CHECK-CVT-NEXT: add v4.4s, v4.4s, v1.4s
+; CHECK-CVT-NEXT: add v1.4s, v2.4s, v1.4s
+; CHECK-CVT-NEXT: mov v2.16b, v5.16b
+; CHECK-CVT-NEXT: bsl v2.16b, v4.16b, v3.16b
+; CHECK-CVT-NEXT: bit v0.16b, v1.16b, v6.16b
+; CHECK-CVT-NEXT: uzp2 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: sub_h:
+; CHECK-BF16: // %bb.0: // %entry
+; CHECK-BF16-NEXT: shll v2.4s, v1.4h, #16
+; CHECK-BF16-NEXT: shll v3.4s, v0.4h, #16
+; CHECK-BF16-NEXT: shll2 v1.4s, v1.8h, #16
+; CHECK-BF16-NEXT: shll2 v0.4s, v0.8h, #16
+; CHECK-BF16-NEXT: fsub v2.4s, v3.4s, v2.4s
+; CHECK-BF16-NEXT: fsub v1.4s, v0.4s, v1.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v2.4s
+; CHECK-BF16-NEXT: bfcvtn2 v0.8h, v1.4s
+; CHECK-BF16-NEXT: ret
+entry:
+ %0 = fsub <8 x bfloat> %a, %b
+ ret <8 x bfloat> %0
+}
+
+
+define <8 x bfloat> @mul_h(<8 x bfloat> %a, <8 x bfloat> %b) {
+; CHECK-CVT-LABEL: mul_h:
+; CHECK-CVT: // %bb.0: // %entry
+; CHECK-CVT-NEXT: shll2 v3.4s, v1.8h, #16
+; CHECK-CVT-NEXT: shll2 v4.4s, v0.8h, #16
+; CHECK-CVT-NEXT: movi v2.4s, #1
+; CHECK-CVT-NEXT: ushr v5.4s, v0.4s, #16
+; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-CVT-NEXT: fmul v3.4s, v4.4s, v3.4s
+; CHECK-CVT-NEXT: fmul v0.4s, v0.4s, v1.4s
+; CHECK-CVT-NEXT: and v2.16b, v5.16b, v2.16b
+; CHECK-CVT-NEXT: movi v1.4s, #127, msl #8
+; CHECK-CVT-NEXT: fcmeq v5.4s, v3.4s, v3.4s
+; CHECK-CVT-NEXT: add v4.4s, v3.4s, v2.4s
+; CHECK-CVT-NEXT: orr v3.4s, #64, lsl #16
+; CHECK-CVT-NEXT: add v2.4s, v0.4s, v2.4s
+; CHECK-CVT-NEXT: fcmeq v6.4s, v0.4s, v0.4s
+; CHECK-CVT-NEXT: orr v0.4s, #64, lsl #16
+; CHECK-CVT-NEXT: add v4.4s, v4.4s, v1.4s
+; CHECK-CVT-NEXT: add v1.4s, v2.4s, v1.4s
+; CHECK-CVT-NEXT: mov v2.16b, v5.16b
+; CHECK-CVT-NEXT: bsl v2.16b, v4.16b, v3.16b
+; CHECK-CVT-NEXT: bit v0.16b, v1.16b, v6.16b
+; CHECK-CVT-NEXT: uzp2 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: mul_h:
+; CHECK-BF16: // %bb.0: // %entry
+; CHECK-BF16-NEXT: shll v2.4s, v1.4h, #16
+; CHECK-BF16-NEXT: shll v3.4s, v0.4h, #16
+; CHECK-BF16-NEXT: shll2 v1.4s, v1.8h, #16
+; CHECK-BF16-NEXT: shll2 v0.4s, v0.8h, #16
+; CHECK-BF16-NEXT: fmul v2.4s, v3.4s, v2.4s
+; CHECK-BF16-NEXT: fmul v1.4s, v0.4s, v1.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v2.4s
+; CHECK-BF16-NEXT: bfcvtn2 v0.8h, v1.4s
+; CHECK-BF16-NEXT: ret
+entry:
+ %0 = fmul <8 x bfloat> %a, %b
+ ret <8 x bfloat> %0
+}
+
+
+define <8 x bfloat> @div_h(<8 x bfloat> %a, <8 x bfloat> %b) {
+; CHECK-CVT-LABEL: div_h:
+; CHECK-CVT: // %bb.0: // %entry
+; CHECK-CVT-NEXT: shll2 v2.4s, v1.8h, #16
+; CHECK-CVT-NEXT: shll2 v3.4s, v0.8h, #16
+; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-CVT-NEXT: movi v4.4s, #127, msl #8
+; CHECK-CVT-NEXT: fdiv v2.4s, v3.4s, v2.4s
+; CHECK-CVT-NEXT: shll v3.4s, v0.4h, #16
+; CHECK-CVT-NEXT: ushr v0.4s, v0.4s, #16
+; CHECK-CVT-NEXT: fdiv v1.4s, v3.4s, v1.4s
+; CHECK-CVT-NEXT: movi v3.4s, #1
+; CHECK-CVT-NEXT: and v0.16b, v0.16b, v3.16b
+; CHECK-CVT-NEXT: add v0.4s, v0.4s, v4.4s
+; CHECK-CVT-NEXT: fcmeq v4.4s, v2.4s, v2.4s
+; CHECK-CVT-NEXT: add v3.4s, v2.4s, v0.4s
+; CHECK-CVT-NEXT: orr v2.4s, #64, lsl #16
+; CHECK-CVT-NEXT: fcmeq v5.4s, v1.4s, v1.4s
+; CHECK-CVT-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-CVT-NEXT: orr v1.4s, #64, lsl #16
+; CHECK-CVT-NEXT: bit v2.16b, v3.16b, v4.16b
+; CHECK-CVT-NEXT: bif v0.16b, v1.16b, v5.16b
+; CHECK-CVT-NEXT: uzp2 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: div_h:
+; CHECK-BF16: // %bb.0: // %entry
+; CHECK-BF16-NEXT: shll v2.4s, v1.4h, #16
+; CHECK-BF16-NEXT: shll v3.4s, v0.4h, #16
+; CHECK-BF16-NEXT: shll2 v1.4s, v1.8h, #16
+; CHECK-BF16-NEXT: shll2 v0.4s, v0.8h, #16
+; CHECK-BF16-NEXT: fdiv v2.4s, v3.4s, v2.4s
+; CHECK-BF16-NEXT: fdiv v1.4s, v0.4s, v1.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v2.4s
+; CHECK-BF16-NEXT: bfcvtn2 v0.8h, v1.4s
+; CHECK-BF16-NEXT: ret
+entry:
+ %0 = fdiv <8 x bfloat> %a, %b
+ ret <8 x bfloat> %0
+}
+
+
+define <8 x bfloat> @load_h(ptr %a) {
+; CHECK-LABEL: load_h:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: ret
+entry:
+ %0 = load <8 x bfloat>, ptr %a, align 4
+ ret <8 x bfloat> %0
+}
+
+
+define void @store_h(ptr %a, <8 x bfloat> %b) {
+; CHECK-LABEL: store_h:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: ret
+entry:
+ store <8 x bfloat> %b, ptr %a, align 4
+ ret void
+}
+
+define <8 x bfloat> @s_to_h(<8 x float> %a) {
+; CHECK-CVT-LABEL: s_to_h:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: movi v2.4s, #1
+; CHECK-CVT-NEXT: movi v3.4s, #127, msl #8
+; CHECK-CVT-NEXT: ushr v4.4s, v1.4s, #16
+; CHECK-CVT-NEXT: ushr v5.4s, v0.4s, #16
+; CHECK-CVT-NEXT: and v4.16b, v4.16b, v2.16b
+; CHECK-CVT-NEXT: add v6.4s, v1.4s, v3.4s
+; CHECK-CVT-NEXT: and v2.16b, v5.16b, v2.16b
+; CHECK-CVT-NEXT: add v3.4s, v0.4s, v3.4s
+; CHECK-CVT-NEXT: fcmeq v5.4s, v1.4s, v1.4s
+; CHECK-CVT-NEXT: orr v1.4s, #64, lsl #16
+; CHECK-CVT-NEXT: add v4.4s, v4.4s, v6.4s
+; CHECK-CVT-NEXT: fcmeq v6.4s, v0.4s, v0.4s
+; CHECK-CVT-NEXT: orr v0.4s, #64, lsl #16
+; CHECK-CVT-NEXT: add v2.4s, v2.4s, v3.4s
+; CHECK-CVT-NEXT: bit v1.16b, v4.16b, v5.16b
+; CHECK-CVT-NEXT: bit v0.16b, v2.16b, v6.16b
+; CHECK-CVT-NEXT: uzp2 v0.8h, v0.8h, v1.8h
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: s_to_h:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: bfcvtn v1.4h, v1.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-BF16-NEXT: ret
+ %1 = fptrunc <8 x float> %a to <8 x bfloat>
+ ret <8 x bfloat> %1
+}
+
+define <8 x bfloat> @d_to_h(<8 x double> %a) {
+; CHECK-CVT-LABEL: d_to_h:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: fcvtxn v2.2s, v2.2d
+; CHECK-CVT-NEXT: fcvtxn v0.2s, v0.2d
+; CHECK-CVT-NEXT: fcvtxn2 v2.4s, v3.2d
+; CHECK-CVT-NEXT: fcvtxn2 v0.4s, v1.2d
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: movi v3.4s, #127, msl #8
+; CHECK-CVT-NEXT: ushr v4.4s, v2.4s, #16
+; CHECK-CVT-NEXT: ushr v5.4s, v0.4s, #16
+; CHECK-CVT-NEXT: add v6.4s, v2.4s, v3.4s
+; CHECK-CVT-NEXT: add v3.4s, v0.4s, v3.4s
+; CHECK-CVT-NEXT: and v4.16b, v4.16b, v1.16b
+; CHECK-CVT-NEXT: and v1.16b, v5.16b, v1.16b
+; CHECK-CVT-NEXT: fcmeq v5.4s, v2.4s, v2.4s
+; CHECK-CVT-NEXT: orr v2.4s, #64, lsl #16
+; CHECK-CVT-NEXT: add v4.4s, v4.4s, v6.4s
+; CHECK-CVT-NEXT: fcmeq v6.4s, v0.4s, v0.4s
+; CHECK-CVT-NEXT: add v1.4s, v1.4s, v3.4s
+; CHECK-CVT-NEXT: orr v0.4s, #64, lsl #16
+; CHECK-CVT-NEXT: bit v2.16b, v4.16b, v5.16b
+; CHECK-CVT-NEXT: bit v0.16b, v1.16b, v6.16b
+; CHECK-CVT-NEXT: uzp2 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: d_to_h:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: fcvtxn v2.2s, v2.2d
+; CHECK-BF16-NEXT: fcvtxn v0.2s, v0.2d
+; CHECK-BF16-NEXT: fcvtxn2 v2.4s, v3.2d
+; CHECK-BF16-NEXT: fcvtxn2 v0.4s, v1.2d
+; CHECK-BF16-NEXT: bfcvtn v1.4h, v2.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-BF16-NEXT: ret
+ %1 = fptrunc <8 x double> %a to <8 x bfloat>
+ ret <8 x bfloat> %1
+}
+
+define <8 x float> @h_to_s(<8 x bfloat> %a) {
+; CHECK-LABEL: h_to_s:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-NEXT: ret
+ %1 = fpext <8 x bfloat> %a to <8 x float>
+ ret <8 x float> %1
+}
+
+define <8 x double> @h_to_d(<8 x bfloat> %a) {
+; CHECK-LABEL: h_to_d:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: shll v2.4s, v0.4h, #16
+; CHECK-NEXT: fcvtl v0.2d, v2.2s
+; CHECK-NEXT: shll v4.4s, v1.4h, #16
+; CHECK-NEXT: fcvtl2 v1.2d, v2.4s
+; CHECK-NEXT: fcvtl2 v3.2d, v4.4s
+; CHECK-NEXT: fcvtl v2.2d, v4.2s
+; CHECK-NEXT: ret
+ %1 = fpext <8 x bfloat> %a to <8 x double>
+ ret <8 x double> %1
+}
+
+
+define <8 x bfloat> @bitcast_i_to_h(float, <8 x i16> %a) {
+; CHECK-LABEL: bitcast_i_to_h:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %2 = bitcast <8 x i16> %a to <8 x bfloat>
+ ret <8 x bfloat> %2
+}
+
+define <8 x i16> @bitcast_h_to_i(float, <8 x bfloat> %a) {
+; CHECK-LABEL: bitcast_h_to_i:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+ %2 = bitcast <8 x bfloat> %a to <8 x i16>
+ ret <8 x i16> %2
+}
+
+define <4 x bfloat> @sitofp_v4i8(<4 x i8> %a) #0 {
+; CHECK-CVT-LABEL: sitofp_v4i8:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: shl v0.4h, v0.4h, #8
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: sshr v0.4h, v0.4h, #8
+; CHECK-CVT-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-CVT-NEXT: scvtf v0.4s, v0.4s
+; CHECK-CVT-NEXT: ushr v2.4s, v0.4s, #16
+; CHECK-CVT-NEXT: and v1.16b, v2.16b, v1.16b
+; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8
+; CHECK-CVT-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v2.4s
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: sitofp_v4i8:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: shl v0.4h, v0.4h, #8
+; CHECK-BF16-NEXT: sshr v0.4h, v0.4h, #8
+; CHECK-BF16-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-BF16-NEXT: scvtf v0.4s, v0.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-BF16-NEXT: ret
+ %1 = sitofp <4 x i8> %a to <4 x bfloat>
+ ret <4 x bfloat> %1
+}
+
+define <8 x bfloat> @sitofp_v8i8(<8 x i8> %a) #0 {
+; CHECK-CVT-LABEL: sitofp_v8i8:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: movi v4.4s, #127, msl #8
+; CHECK-CVT-NEXT: sshll v2.4s, v0.4h, #0
+; CHECK-CVT-NEXT: sshll2 v0.4s, v0.8h, #0
+; CHECK-CVT-NEXT: scvtf v2.4s, v2.4s
+; CHECK-CVT-NEXT: scvtf v3.4s, v0.4s
+; CHECK-CVT-NEXT: ushr v0.4s, v2.4s, #16
+; CHECK-CVT-NEXT: ushr v5.4s, v3.4s, #16
+; CHECK-CVT-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-CVT-NEXT: and v1.16b, v5.16b, v1.16b
+; CHECK-CVT-NEXT: add v0.4s, v0.4s, v4.4s
+; CHECK-CVT-NEXT: add v1.4s, v1.4s, v4.4s
+; CHECK-CVT-NEXT: addhn v0.4h, v2.4s, v0.4s
+; CHECK-CVT-NEXT: addhn2 v0.8h, v3.4s, v1.4s
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: sitofp_v8i8:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-BF16-NEXT: sshll2 v1.4s, v0.8h, #0
+; CHECK-BF16-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-BF16-NEXT: scvtf v1.4s, v1.4s
+; CHECK-BF16-NEXT: scvtf v0.4s, v0.4s
+; CHECK-BF16-NEXT: bfcvtn v1.4h, v1.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-BF16-NEXT: ret
+ %1 = sitofp <8 x i8> %a to <8 x bfloat>
+ ret <8 x bfloat> %1
+}
+
+define <16 x bfloat> @sitofp_v16i8(<16 x i8> %a) #0 {
+; CHECK-CVT-LABEL: sitofp_v16i8:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: sshll2 v2.8h, v0.16b, #0
+; CHECK-CVT-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: movi v7.4s, #127, msl #8
+; CHECK-CVT-NEXT: sshll v3.4s, v2.4h, #0
+; CHECK-CVT-NEXT: sshll v4.4s, v0.4h, #0
+; CHECK-CVT-NEXT: sshll2 v2.4s, v2.8h, #0
+; CHECK-CVT-NEXT: sshll2 v0.4s, v0.8h, #0
+; CHECK-CVT-NEXT: scvtf v3.4s, v3.4s
+; CHECK-CVT-NEXT: scvtf v4.4s, v4.4s
+; CHECK-CVT-NEXT: scvtf v2.4s, v2.4s
+; CHECK-CVT-NEXT: scvtf v6.4s, v0.4s
+; CHECK-CVT-NEXT: ushr v5.4s, v3.4s, #16
+; CHECK-CVT-NEXT: ushr v0.4s, v4.4s, #16
+; CHECK-CVT-NEXT: ushr v16.4s, v2.4s, #16
+; CHECK-CVT-NEXT: ushr v17.4s, v6.4s, #16
+; CHECK-CVT-NEXT: and v5.16b, v5.16b, v1.16b
+; CHECK-CVT-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-CVT-NEXT: and v16.16b, v16.16b, v1.16b
+; CHECK-CVT-NEXT: and v17.16b, v17.16b, v1.16b
+; CHECK-CVT-NEXT: add v5.4s, v5.4s, v7.4s
+; CHECK-CVT-NEXT: add v0.4s, v0.4s, v7.4s
+; CHECK-CVT-NEXT: addhn v1.4h, v3.4s, v5.4s
+; CHECK-CVT-NEXT: addhn v0.4h, v4.4s, v0.4s
+; CHECK-CVT-NEXT: add v3.4s, v16.4s, v7.4s
+; CHECK-CVT-NEXT: add v4.4s, v17.4s, v7.4s
+; CHECK-CVT-NEXT: addhn2 v1.8h, v2.4s, v3.4s
+; CHECK-CVT-NEXT: addhn2 v0.8h, v6.4s, v4.4s
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: sitofp_v16i8:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: sshll2 v1.8h, v0.16b, #0
+; CHECK-BF16-NEXT: sshll v0.8h, v0.8b, #0
+; CHECK-BF16-NEXT: sshll2 v2.4s, v1.8h, #0
+; CHECK-BF16-NEXT: sshll v1.4s, v1.4h, #0
+; CHECK-BF16-NEXT: sshll2 v3.4s, v0.8h, #0
+; CHECK-BF16-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-BF16-NEXT: scvtf v2.4s, v2.4s
+; CHECK-BF16-NEXT: scvtf v1.4s, v1.4s
+; CHECK-BF16-NEXT: scvtf v3.4s, v3.4s
+; CHECK-BF16-NEXT: scvtf v0.4s, v0.4s
+; CHECK-BF16-NEXT: bfcvtn v2.4h, v2.4s
+; CHECK-BF16-NEXT: bfcvtn v1.4h, v1.4s
+; CHECK-BF16-NEXT: bfcvtn v3.4h, v3.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: mov v1.d[1], v2.d[0]
+; CHECK-BF16-NEXT: mov v0.d[1], v3.d[0]
+; CHECK-BF16-NEXT: ret
+ %1 = sitofp <16 x i8> %a to <16 x bfloat>
+ ret <16 x bfloat> %1
+}
+
+define <8 x bfloat> @sitofp_i16(<8 x i16> %a) #0 {
+; CHECK-CVT-LABEL: sitofp_i16:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: sshll v2.4s, v0.4h, #0
+; CHECK-CVT-NEXT: sshll2 v0.4s, v0.8h, #0
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: movi v4.4s, #127, msl #8
+; CHECK-CVT-NEXT: scvtf v2.4s, v2.4s
+; CHECK-CVT-NEXT: scvtf v3.4s, v0.4s
+; CHECK-CVT-NEXT: ushr v0.4s, v2.4s, #16
+; CHECK-CVT-NEXT: ushr v5.4s, v3.4s, #16
+; CHECK-CVT-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-CVT-NEXT: and v1.16b, v5.16b, v1.16b
+; CHECK-CVT-NEXT: add v0.4s, v0.4s, v4.4s
+; CHECK-CVT-NEXT: add v1.4s, v1.4s, v4.4s
+; CHECK-CVT-NEXT: addhn v0.4h, v2.4s, v0.4s
+; CHECK-CVT-NEXT: addhn2 v0.8h, v3.4s, v1.4s
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: sitofp_i16:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: sshll2 v1.4s, v0.8h, #0
+; CHECK-BF16-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-BF16-NEXT: scvtf v1.4s, v1.4s
+; CHECK-BF16-NEXT: scvtf v0.4s, v0.4s
+; CHECK-BF16-NEXT: bfcvtn v1.4h, v1.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-BF16-NEXT: ret
+ %1 = sitofp <8 x i16> %a to <8 x bfloat>
+ ret <8 x bfloat> %1
+}
+
+define <8 x bfloat> @sitofp_i32(<8 x i32> %a) #0 {
+; CHECK-CVT-LABEL: sitofp_i32:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: scvtf v0.4s, v0.4s
+; CHECK-CVT-NEXT: movi v2.4s, #1
+; CHECK-CVT-NEXT: scvtf v1.4s, v1.4s
+; CHECK-CVT-NEXT: movi v5.4s, #127, msl #8
+; CHECK-CVT-NEXT: ushr v3.4s, v0.4s, #16
+; CHECK-CVT-NEXT: ushr v4.4s, v1.4s, #16
+; CHECK-CVT-NEXT: and v3.16b, v3.16b, v2.16b
+; CHECK-CVT-NEXT: and v2.16b, v4.16b, v2.16b
+; CHECK-CVT-NEXT: add v0.4s, v3.4s, v0.4s
+; CHECK-CVT-NEXT: add v1.4s, v2.4s, v1.4s
+; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v5.4s
+; CHECK-CVT-NEXT: addhn2 v0.8h, v1.4s, v5.4s
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: sitofp_i32:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: scvtf v1.4s, v1.4s
+; CHECK-BF16-NEXT: scvtf v0.4s, v0.4s
+; CHECK-BF16-NEXT: bfcvtn v1.4h, v1.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-BF16-NEXT: ret
+ %1 = sitofp <8 x i32> %a to <8 x bfloat>
+ ret <8 x bfloat> %1
+}
+
+
+define <8 x bfloat> @sitofp_i64(<8 x i64> %a) #0 {
+; CHECK-CVT-LABEL: sitofp_i64:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: scvtf v2.2d, v2.2d
+; CHECK-CVT-NEXT: scvtf v0.2d, v0.2d
+; CHECK-CVT-NEXT: scvtf v3.2d, v3.2d
+; CHECK-CVT-NEXT: scvtf v1.2d, v1.2d
+; CHECK-CVT-NEXT: fcvtn v2.2s, v2.2d
+; CHECK-CVT-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-CVT-NEXT: fcvtn2 v2.4s, v3.2d
+; CHECK-CVT-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: movi v3.4s, #127, msl #8
+; CHECK-CVT-NEXT: ushr v4.4s, v2.4s, #16
+; CHECK-CVT-NEXT: ushr v5.4s, v0.4s, #16
+; CHECK-CVT-NEXT: add v6.4s, v2.4s, v3.4s
+; CHECK-CVT-NEXT: add v3.4s, v0.4s, v3.4s
+; CHECK-CVT-NEXT: and v4.16b, v4.16b, v1.16b
+; CHECK-CVT-NEXT: and v1.16b, v5.16b, v1.16b
+; CHECK-CVT-NEXT: fcmeq v5.4s, v2.4s, v2.4s
+; CHECK-CVT-NEXT: orr v2.4s, #64, lsl #16
+; CHECK-CVT-NEXT: add v4.4s, v4.4s, v6.4s
+; CHECK-CVT-NEXT: fcmeq v6.4s, v0.4s, v0.4s
+; CHECK-CVT-NEXT: add v1.4s, v1.4s, v3.4s
+; CHECK-CVT-NEXT: orr v0.4s, #64, lsl #16
+; CHECK-CVT-NEXT: bit v2.16b, v4.16b, v5.16b
+; CHECK-CVT-NEXT: bit v0.16b, v1.16b, v6.16b
+; CHECK-CVT-NEXT: uzp2 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: sitofp_i64:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: scvtf v2.2d, v2.2d
+; CHECK-BF16-NEXT: scvtf v0.2d, v0.2d
+; CHECK-BF16-NEXT: scvtf v3.2d, v3.2d
+; CHECK-BF16-NEXT: scvtf v1.2d, v1.2d
+; CHECK-BF16-NEXT: fcvtn v2.2s, v2.2d
+; CHECK-BF16-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-BF16-NEXT: fcvtn2 v2.4s, v3.2d
+; CHECK-BF16-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-BF16-NEXT: bfcvtn v1.4h, v2.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-BF16-NEXT: ret
+ %1 = sitofp <8 x i64> %a to <8 x bfloat>
+ ret <8 x bfloat> %1
+}
+
+define <4 x bfloat> @uitofp_v4i8(<4 x i8> %a) #0 {
+; CHECK-CVT-LABEL: uitofp_v4i8:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: bic v0.4h, #255, lsl #8
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-CVT-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-CVT-NEXT: ushr v2.4s, v0.4s, #16
+; CHECK-CVT-NEXT: and v1.16b, v2.16b, v1.16b
+; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8
+; CHECK-CVT-NEXT: add v0.4s, v1.4s, v0.4s
+; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v2.4s
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: uitofp_v4i8:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: bic v0.4h, #255, lsl #8
+; CHECK-BF16-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BF16-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-BF16-NEXT: ret
+ %1 = uitofp <4 x i8> %a to <4 x bfloat>
+ ret <4 x bfloat> %1
+}
+
+define <8 x bfloat> @uitofp_v8i8(<8 x i8> %a) #0 {
+; CHECK-CVT-LABEL: uitofp_v8i8:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: movi v4.4s, #127, msl #8
+; CHECK-CVT-NEXT: ushll v2.4s, v0.4h, #0
+; CHECK-CVT-NEXT: ushll2 v0.4s, v0.8h, #0
+; CHECK-CVT-NEXT: ucvtf v2.4s, v2.4s
+; CHECK-CVT-NEXT: ucvtf v3.4s, v0.4s
+; CHECK-CVT-NEXT: ushr v0.4s, v2.4s, #16
+; CHECK-CVT-NEXT: ushr v5.4s, v3.4s, #16
+; CHECK-CVT-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-CVT-NEXT: and v1.16b, v5.16b, v1.16b
+; CHECK-CVT-NEXT: add v0.4s, v0.4s, v4.4s
+; CHECK-CVT-NEXT: add v1.4s, v1.4s, v4.4s
+; CHECK-CVT-NEXT: addhn v0.4h, v2.4s, v0.4s
+; CHECK-CVT-NEXT: addhn2 v0.8h, v3.4s, v1.4s
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: uitofp_v8i8:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-BF16-NEXT: ushll2 v1.4s, v0.8h, #0
+; CHECK-BF16-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BF16-NEXT: ucvtf v1.4s, v1.4s
+; CHECK-BF16-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-BF16-NEXT: bfcvtn v1.4h, v1.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-BF16-NEXT: ret
+ %1 = uitofp <8 x i8> %a to <8 x bfloat>
+ ret <8 x bfloat> %1
+}
+
+define <16 x bfloat> @uitofp_v16i8(<16 x i8> %a) #0 {
+; CHECK-CVT-LABEL: uitofp_v16i8:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: ushll2 v2.8h, v0.16b, #0
+; CHECK-CVT-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: movi v7.4s, #127, msl #8
+; CHECK-CVT-NEXT: ushll v3.4s, v2.4h, #0
+; CHECK-CVT-NEXT: ushll v4.4s, v0.4h, #0
+; CHECK-CVT-NEXT: ushll2 v2.4s, v2.8h, #0
+; CHECK-CVT-NEXT: ushll2 v0.4s, v0.8h, #0
+; CHECK-CVT-NEXT: ucvtf v3.4s, v3.4s
+; CHECK-CVT-NEXT: ucvtf v4.4s, v4.4s
+; CHECK-CVT-NEXT: ucvtf v2.4s, v2.4s
+; CHECK-CVT-NEXT: ucvtf v6.4s, v0.4s
+; CHECK-CVT-NEXT: ushr v5.4s, v3.4s, #16
+; CHECK-CVT-NEXT: ushr v0.4s, v4.4s, #16
+; CHECK-CVT-NEXT: ushr v16.4s, v2.4s, #16
+; CHECK-CVT-NEXT: ushr v17.4s, v6.4s, #16
+; CHECK-CVT-NEXT: and v5.16b, v5.16b, v1.16b
+; CHECK-CVT-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-CVT-NEXT: and v16.16b, v16.16b, v1.16b
+; CHECK-CVT-NEXT: and v17.16b, v17.16b, v1.16b
+; CHECK-CVT-NEXT: add v5.4s, v5.4s, v7.4s
+; CHECK-CVT-NEXT: add v0.4s, v0.4s, v7.4s
+; CHECK-CVT-NEXT: addhn v1.4h, v3.4s, v5.4s
+; CHECK-CVT-NEXT: addhn v0.4h, v4.4s, v0.4s
+; CHECK-CVT-NEXT: add v3.4s, v16.4s, v7.4s
+; CHECK-CVT-NEXT: add v4.4s, v17.4s, v7.4s
+; CHECK-CVT-NEXT: addhn2 v1.8h, v2.4s, v3.4s
+; CHECK-CVT-NEXT: addhn2 v0.8h, v6.4s, v4.4s
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: uitofp_v16i8:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: ushll2 v1.8h, v0.16b, #0
+; CHECK-BF16-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-BF16-NEXT: ushll2 v2.4s, v1.8h, #0
+; CHECK-BF16-NEXT: ushll v1.4s, v1.4h, #0
+; CHECK-BF16-NEXT: ushll2 v3.4s, v0.8h, #0
+; CHECK-BF16-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BF16-NEXT: ucvtf v2.4s, v2.4s
+; CHECK-BF16-NEXT: ucvtf v1.4s, v1.4s
+; CHECK-BF16-NEXT: ucvtf v3.4s, v3.4s
+; CHECK-BF16-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-BF16-NEXT: bfcvtn v2.4h, v2.4s
+; CHECK-BF16-NEXT: bfcvtn v1.4h, v1.4s
+; CHECK-BF16-NEXT: bfcvtn v3.4h, v3.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: mov v1.d[1], v2.d[0]
+; CHECK-BF16-NEXT: mov v0.d[1], v3.d[0]
+; CHECK-BF16-NEXT: ret
+ %1 = uitofp <16 x i8> %a to <16 x bfloat>
+ ret <16 x bfloat> %1
+}
+
+
+define <8 x bfloat> @uitofp_i16(<8 x i16> %a) #0 {
+; CHECK-CVT-LABEL: uitofp_i16:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: ushll v2.4s, v0.4h, #0
+; CHECK-CVT-NEXT: ushll2 v0.4s, v0.8h, #0
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: movi v4.4s, #127, msl #8
+; CHECK-CVT-NEXT: ucvtf v2.4s, v2.4s
+; CHECK-CVT-NEXT: ucvtf v3.4s, v0.4s
+; CHECK-CVT-NEXT: ushr v0.4s, v2.4s, #16
+; CHECK-CVT-NEXT: ushr v5.4s, v3.4s, #16
+; CHECK-CVT-NEXT: and v0.16b, v0.16b, v1.16b
+; CHECK-CVT-NEXT: and v1.16b, v5.16b, v1.16b
+; CHECK-CVT-NEXT: add v0.4s, v0.4s, v4.4s
+; CHECK-CVT-NEXT: add v1.4s, v1.4s, v4.4s
+; CHECK-CVT-NEXT: addhn v0.4h, v2.4s, v0.4s
+; CHECK-CVT-NEXT: addhn2 v0.8h, v3.4s, v1.4s
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: uitofp_i16:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: ushll2 v1.4s, v0.8h, #0
+; CHECK-BF16-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-BF16-NEXT: ucvtf v1.4s, v1.4s
+; CHECK-BF16-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-BF16-NEXT: bfcvtn v1.4h, v1.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-BF16-NEXT: ret
+ %1 = uitofp <8 x i16> %a to <8 x bfloat>
+ ret <8 x bfloat> %1
+}
+
+
+define <8 x bfloat> @uitofp_i32(<8 x i32> %a) #0 {
+; CHECK-CVT-LABEL: uitofp_i32:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-CVT-NEXT: movi v2.4s, #1
+; CHECK-CVT-NEXT: ucvtf v1.4s, v1.4s
+; CHECK-CVT-NEXT: movi v5.4s, #127, msl #8
+; CHECK-CVT-NEXT: ushr v3.4s, v0.4s, #16
+; CHECK-CVT-NEXT: ushr v4.4s, v1.4s, #16
+; CHECK-CVT-NEXT: and v3.16b, v3.16b, v2.16b
+; CHECK-CVT-NEXT: and v2.16b, v4.16b, v2.16b
+; CHECK-CVT-NEXT: add v0.4s, v3.4s, v0.4s
+; CHECK-CVT-NEXT: add v1.4s, v2.4s, v1.4s
+; CHECK-CVT-NEXT: addhn v0.4h, v0.4s, v5.4s
+; CHECK-CVT-NEXT: addhn2 v0.8h, v1.4s, v5.4s
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: uitofp_i32:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: ucvtf v1.4s, v1.4s
+; CHECK-BF16-NEXT: ucvtf v0.4s, v0.4s
+; CHECK-BF16-NEXT: bfcvtn v1.4h, v1.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-BF16-NEXT: ret
+ %1 = uitofp <8 x i32> %a to <8 x bfloat>
+ ret <8 x bfloat> %1
+}
+
+
+define <8 x bfloat> @uitofp_i64(<8 x i64> %a) #0 {
+; CHECK-CVT-LABEL: uitofp_i64:
+; CHECK-CVT: // %bb.0:
+; CHECK-CVT-NEXT: ucvtf v2.2d, v2.2d
+; CHECK-CVT-NEXT: ucvtf v0.2d, v0.2d
+; CHECK-CVT-NEXT: ucvtf v3.2d, v3.2d
+; CHECK-CVT-NEXT: ucvtf v1.2d, v1.2d
+; CHECK-CVT-NEXT: fcvtn v2.2s, v2.2d
+; CHECK-CVT-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-CVT-NEXT: fcvtn2 v2.4s, v3.2d
+; CHECK-CVT-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-CVT-NEXT: movi v1.4s, #1
+; CHECK-CVT-NEXT: movi v3.4s, #127, msl #8
+; CHECK-CVT-NEXT: ushr v4.4s, v2.4s, #16
+; CHECK-CVT-NEXT: ushr v5.4s, v0.4s, #16
+; CHECK-CVT-NEXT: add v6.4s, v2.4s, v3.4s
+; CHECK-CVT-NEXT: add v3.4s, v0.4s, v3.4s
+; CHECK-CVT-NEXT: and v4.16b, v4.16b, v1.16b
+; CHECK-CVT-NEXT: and v1.16b, v5.16b, v1.16b
+; CHECK-CVT-NEXT: fcmeq v5.4s, v2.4s, v2.4s
+; CHECK-CVT-NEXT: orr v2.4s, #64, lsl #16
+; CHECK-CVT-NEXT: add v4.4s, v4.4s, v6.4s
+; CHECK-CVT-NEXT: fcmeq v6.4s, v0.4s, v0.4s
+; CHECK-CVT-NEXT: add v1.4s, v1.4s, v3.4s
+; CHECK-CVT-NEXT: orr v0.4s, #64, lsl #16
+; CHECK-CVT-NEXT: bit v2.16b, v4.16b, v5.16b
+; CHECK-CVT-NEXT: bit v0.16b, v1.16b, v6.16b
+; CHECK-CVT-NEXT: uzp2 v0.8h, v0.8h, v2.8h
+; CHECK-CVT-NEXT: ret
+;
+; CHECK-BF16-LABEL: uitofp_i64:
+; CHECK-BF16: // %bb.0:
+; CHECK-BF16-NEXT: ucvtf v2.2d, v2.2d
+; CHECK-BF16-NEXT: ucvtf v0.2d, v0.2d
+; CHECK-BF16-NEXT: ucvtf v3.2d, v3.2d
+; CHECK-BF16-NEXT: ucvtf v1.2d, v1.2d
+; CHECK-BF16-NEXT: fcvtn v2.2s, v2.2d
+; CHECK-BF16-NEXT: fcvtn v0.2s, v0.2d
+; CHECK-BF16-NEXT: fcvtn2 v2.4s, v3.2d
+; CHECK-BF16-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-BF16-NEXT: bfcvtn v1.4h, v2.4s
+; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
+; CHECK-BF16-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-BF16-NEXT: ret
+ %1 = uitofp <8 x i64> %a to <8 x bfloat>
+ ret <8 x bfloat> %1
+}
+
+define void @test_insert_at_zero(bfloat %a, ptr %b) #0 {
+; CHECK-LABEL: test_insert_at_zero:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $h0 killed $h0 def $q0
+; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: ret
+ %1 = insertelement <8 x bfloat> undef, bfloat %a, i64 0
+ store <8 x bfloat> %1, ptr %b, align 4
+ ret void
+}
+
+define <8 x i8> @fptosi_i8(<8 x bfloat> %a) #0 {
+; CHECK-LABEL: fptosi_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: fcvtzs v0.4s, v0.4s
+; CHECK-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-NEXT: fcvtzs v1.4s, v1.4s
+; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: xtn v0.8b, v0.8h
+; CHECK-NEXT: ret
+ %1 = fptosi <8 x bfloat> %a to <8 x i8>
+ ret <8 x i8> %1
+}
+
+define <8 x i16> @fptosi_i16(<8 x bfloat> %a) #0 {
+; CHECK-LABEL: fptosi_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: fcvtzs v0.4s, v0.4s
+; CHECK-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-NEXT: fcvtzs v1.4s, v1.4s
+; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: ret
+ %1 = fptosi <8 x bfloat> %a to <8 x i16>
+ ret <8 x i16> %1
+}
+
+define <8 x i8> @fptoui_i8(<8 x bfloat> %a) #0 {
+; CHECK-LABEL: fptoui_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: fcvtzu v0.4s, v0.4s
+; CHECK-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-NEXT: fcvtzu v1.4s, v1.4s
+; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: xtn v0.8b, v0.8h
+; CHECK-NEXT: ret
+ %1 = fptoui <8 x bfloat> %a to <8 x i8>
+ ret <8 x i8> %1
+}
+
+define <8 x i16> @fptoui_i16(<8 x bfloat> %a) #0 {
+; CHECK-LABEL: fptoui_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: shll v0.4s, v0.4h, #16
+; CHECK-NEXT: fcvtzu v0.4s, v0.4s
+; CHECK-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-NEXT: fcvtzu v1.4s, v1.4s
+; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: ret
+ %1 = fptoui <8 x bfloat> %a to <8 x i16>
+ ret <8 x i16> %1
+}
+
+define <8 x i1> @test_fcmp_une(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_une:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov h2, v1.h[1]
+; CHECK-NEXT: mov h3, v0.h[1]
+; CHECK-NEXT: fmov w10, s1
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: mov h2, v1.h[2]
+; CHECK-NEXT: mov h3, v0.h[2]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: fmov w10, s3
+; CHECK-NEXT: mov h3, v1.h[4]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: mov h4, v1.h[3]
+; CHECK-NEXT: lsl w10, w10, #16
+; CHECK-NEXT: fmov s6, w8
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: csetm w9, ne
+; CHECK-NEXT: fmov s16, w10
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: mov h5, v0.h[3]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: mov h6, v0.h[4]
+; CHECK-NEXT: mov h4, v1.h[5]
+; CHECK-NEXT: fmov s7, w8
+; CHECK-NEXT: csetm w8, ne
+; CHECK-NEXT: fmov s2, w8
+; CHECK-NEXT: fmov w8, s5
+; CHECK-NEXT: mov h5, v0.h[5]
+; CHECK-NEXT: fcmp s16, s7
+; CHECK-NEXT: mov v2.h[1], w9
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov w10, s3
+; CHECK-NEXT: fmov s3, w9
+; CHECK-NEXT: fmov w9, s6
+; CHECK-NEXT: fmov s7, w8
+; CHECK-NEXT: csetm w8, ne
+; CHECK-NEXT: mov v2.h[2], w8
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fcmp s7, s3
+; CHECK-NEXT: mov h3, v1.h[6]
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: mov h1, v1.h[7]
+; CHECK-NEXT: fmov s6, w9
+; CHECK-NEXT: fmov w9, s5
+; CHECK-NEXT: csetm w8, ne
+; CHECK-NEXT: mov v2.h[3], w8
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: fcmp s6, s4
+; CHECK-NEXT: mov h4, v0.h[6]
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s5, w8
+; CHECK-NEXT: mov h0, v0.h[7]
+; CHECK-NEXT: fmov s6, w9
+; CHECK-NEXT: csetm w8, ne
+; CHECK-NEXT: mov v2.h[4], w8
+; CHECK-NEXT: fmov w8, s3
+; CHECK-NEXT: fmov w9, s4
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: csetm w10, ne
+; CHECK-NEXT: fmov s3, w8
+; CHECK-NEXT: fmov s4, w9
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: mov v2.h[5], w10
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fcmp s4, s3
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: csetm w8, ne
+; CHECK-NEXT: mov v2.h[6], w8
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: csetm w8, ne
+; CHECK-NEXT: mov v2.h[7], w8
+; CHECK-NEXT: xtn v0.8b, v2.8h
+; CHECK-NEXT: ret
+ %1 = fcmp une <8 x bfloat> %a, %b
+ ret <8 x i1> %1
+}
+
+define <8 x i1> @test_fcmp_ueq(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_ueq:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov h2, v1.h[1]
+; CHECK-NEXT: mov h3, v0.h[1]
+; CHECK-NEXT: fmov w10, s1
+; CHECK-NEXT: fmov w11, s0
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: mov h2, v1.h[2]
+; CHECK-NEXT: mov h3, v0.h[2]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: lsl w9, w11, #16
+; CHECK-NEXT: fmov s6, w8
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: fmov s7, w9
+; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: mov h4, v1.h[3]
+; CHECK-NEXT: mov h5, v0.h[3]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: csetm w10, eq
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: csinv w10, w10, wzr, vc
+; CHECK-NEXT: fcmp s7, s6
+; CHECK-NEXT: fmov s2, w8
+; CHECK-NEXT: fmov s3, w9
+; CHECK-NEXT: fmov w11, s4
+; CHECK-NEXT: fmov w8, s5
+; CHECK-NEXT: mov h4, v0.h[4]
+; CHECK-NEXT: mov h7, v1.h[5]
+; CHECK-NEXT: csetm w9, eq
+; CHECK-NEXT: csinv w9, w9, wzr, vc
+; CHECK-NEXT: fcmp s3, s2
+; CHECK-NEXT: mov h3, v1.h[4]
+; CHECK-NEXT: fmov s2, w9
+; CHECK-NEXT: lsl w11, w11, #16
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov s5, w11
+; CHECK-NEXT: fmov s6, w8
+; CHECK-NEXT: csetm w8, eq
+; CHECK-NEXT: mov v2.h[1], w10
+; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: csinv w8, w8, wzr, vc
+; CHECK-NEXT: mov h3, v1.h[6]
+; CHECK-NEXT: mov h1, v1.h[7]
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: mov h5, v0.h[5]
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: lsl w10, w10, #16
+; CHECK-NEXT: mov v2.h[2], w8
+; CHECK-NEXT: fmov s4, w9
+; CHECK-NEXT: fmov s6, w10
+; CHECK-NEXT: csetm w8, eq
+; CHECK-NEXT: fmov w9, s7
+; CHECK-NEXT: fmov w10, s5
+; CHECK-NEXT: csinv w8, w8, wzr, vc
+; CHECK-NEXT: fcmp s6, s4
+; CHECK-NEXT: mov h4, v0.h[6]
+; CHECK-NEXT: mov v2.h[3], w8
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: lsl w10, w10, #16
+; CHECK-NEXT: mov h0, v0.h[7]
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: fmov s6, w10
+; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: csetm w8, eq
+; CHECK-NEXT: csinv w8, w8, wzr, vc
+; CHECK-NEXT: mov v2.h[4], w8
+; CHECK-NEXT: lsl w8, w9, #16
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: fmov s3, w8
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov s4, w9
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: csetm w10, eq
+; CHECK-NEXT: csinv w10, w10, wzr, vc
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: mov v2.h[5], w10
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fcmp s4, s3
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: csetm w8, eq
+; CHECK-NEXT: csinv w8, w8, wzr, vc
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: mov v2.h[6], w8
+; CHECK-NEXT: csetm w8, eq
+; CHECK-NEXT: csinv w8, w8, wzr, vc
+; CHECK-NEXT: mov v2.h[7], w8
+; CHECK-NEXT: xtn v0.8b, v2.8h
+; CHECK-NEXT: ret
+ %1 = fcmp ueq <8 x bfloat> %a, %b
+ ret <8 x i1> %1
+}
+
+define <8 x i1> @test_fcmp_ugt(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_ugt:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov h2, v1.h[1]
+; CHECK-NEXT: mov h3, v0.h[1]
+; CHECK-NEXT: fmov w10, s1
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: mov h2, v1.h[2]
+; CHECK-NEXT: mov h3, v0.h[2]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: fmov w10, s3
+; CHECK-NEXT: mov h3, v1.h[4]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: mov h4, v1.h[3]
+; CHECK-NEXT: lsl w10, w10, #16
+; CHECK-NEXT: fmov s6, w8
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: csetm w9, hi
+; CHECK-NEXT: fmov s16, w10
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: mov h5, v0.h[3]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: mov h6, v0.h[4]
+; CHECK-NEXT: mov h4, v1.h[5]
+; CHECK-NEXT: fmov s7, w8
+; CHECK-NEXT: csetm w8, hi
+; CHECK-NEXT: fmov s2, w8
+; CHECK-NEXT: fmov w8, s5
+; CHECK-NEXT: mov h5, v0.h[5]
+; CHECK-NEXT: fcmp s16, s7
+; CHECK-NEXT: mov v2.h[1], w9
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov w10, s3
+; CHECK-NEXT: fmov s3, w9
+; CHECK-NEXT: fmov w9, s6
+; CHECK-NEXT: fmov s7, w8
+; CHECK-NEXT: csetm w8, hi
+; CHECK-NEXT: mov v2.h[2], w8
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fcmp s7, s3
+; CHECK-NEXT: mov h3, v1.h[6]
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: mov h1, v1.h[7]
+; CHECK-NEXT: fmov s6, w9
+; CHECK-NEXT: fmov w9, s5
+; CHECK-NEXT: csetm w8, hi
+; CHECK-NEXT: mov v2.h[3], w8
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: fcmp s6, s4
+; CHECK-NEXT: mov h4, v0.h[6]
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s5, w8
+; CHECK-NEXT: mov h0, v0.h[7]
+; CHECK-NEXT: fmov s6, w9
+; CHECK-NEXT: csetm w8, hi
+; CHECK-NEXT: mov v2.h[4], w8
+; CHECK-NEXT: fmov w8, s3
+; CHECK-NEXT: fmov w9, s4
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: csetm w10, hi
+; CHECK-NEXT: fmov s3, w8
+; CHECK-NEXT: fmov s4, w9
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: mov v2.h[5], w10
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fcmp s4, s3
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: csetm w8, hi
+; CHECK-NEXT: mov v2.h[6], w8
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: csetm w8, hi
+; CHECK-NEXT: mov v2.h[7], w8
+; CHECK-NEXT: xtn v0.8b, v2.8h
+; CHECK-NEXT: ret
+ %1 = fcmp ugt <8 x bfloat> %a, %b
+ ret <8 x i1> %1
+}
+
+define <8 x i1> @test_fcmp_uge(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_uge:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov h2, v1.h[1]
+; CHECK-NEXT: mov h3, v0.h[1]
+; CHECK-NEXT: fmov w10, s1
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: mov h2, v1.h[2]
+; CHECK-NEXT: mov h3, v0.h[2]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: fmov w10, s3
+; CHECK-NEXT: mov h3, v1.h[4]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: mov h4, v1.h[3]
+; CHECK-NEXT: lsl w10, w10, #16
+; CHECK-NEXT: fmov s6, w8
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: csetm w9, pl
+; CHECK-NEXT: fmov s16, w10
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: mov h5, v0.h[3]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: mov h6, v0.h[4]
+; CHECK-NEXT: mov h4, v1.h[5]
+; CHECK-NEXT: fmov s7, w8
+; CHECK-NEXT: csetm w8, pl
+; CHECK-NEXT: fmov s2, w8
+; CHECK-NEXT: fmov w8, s5
+; CHECK-NEXT: mov h5, v0.h[5]
+; CHECK-NEXT: fcmp s16, s7
+; CHECK-NEXT: mov v2.h[1], w9
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov w10, s3
+; CHECK-NEXT: fmov s3, w9
+; CHECK-NEXT: fmov w9, s6
+; CHECK-NEXT: fmov s7, w8
+; CHECK-NEXT: csetm w8, pl
+; CHECK-NEXT: mov v2.h[2], w8
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fcmp s7, s3
+; CHECK-NEXT: mov h3, v1.h[6]
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: mov h1, v1.h[7]
+; CHECK-NEXT: fmov s6, w9
+; CHECK-NEXT: fmov w9, s5
+; CHECK-NEXT: csetm w8, pl
+; CHECK-NEXT: mov v2.h[3], w8
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: fcmp s6, s4
+; CHECK-NEXT: mov h4, v0.h[6]
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s5, w8
+; CHECK-NEXT: mov h0, v0.h[7]
+; CHECK-NEXT: fmov s6, w9
+; CHECK-NEXT: csetm w8, pl
+; CHECK-NEXT: mov v2.h[4], w8
+; CHECK-NEXT: fmov w8, s3
+; CHECK-NEXT: fmov w9, s4
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: csetm w10, pl
+; CHECK-NEXT: fmov s3, w8
+; CHECK-NEXT: fmov s4, w9
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: mov v2.h[5], w10
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fcmp s4, s3
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: csetm w8, pl
+; CHECK-NEXT: mov v2.h[6], w8
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: csetm w8, pl
+; CHECK-NEXT: mov v2.h[7], w8
+; CHECK-NEXT: xtn v0.8b, v2.8h
+; CHECK-NEXT: ret
+ %1 = fcmp uge <8 x bfloat> %a, %b
+ ret <8 x i1> %1
+}
+
+define <8 x i1> @test_fcmp_ult(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_ult:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov h2, v1.h[1]
+; CHECK-NEXT: mov h3, v0.h[1]
+; CHECK-NEXT: fmov w10, s1
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: mov h2, v1.h[2]
+; CHECK-NEXT: mov h3, v0.h[2]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: fmov w10, s3
+; CHECK-NEXT: mov h3, v1.h[4]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: mov h4, v1.h[3]
+; CHECK-NEXT: lsl w10, w10, #16
+; CHECK-NEXT: fmov s6, w8
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: csetm w9, lt
+; CHECK-NEXT: fmov s16, w10
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: mov h5, v0.h[3]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: mov h6, v0.h[4]
+; CHECK-NEXT: mov h4, v1.h[5]
+; CHECK-NEXT: fmov s7, w8
+; CHECK-NEXT: csetm w8, lt
+; CHECK-NEXT: fmov s2, w8
+; CHECK-NEXT: fmov w8, s5
+; CHECK-NEXT: mov h5, v0.h[5]
+; CHECK-NEXT: fcmp s16, s7
+; CHECK-NEXT: mov v2.h[1], w9
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov w10, s3
+; CHECK-NEXT: fmov s3, w9
+; CHECK-NEXT: fmov w9, s6
+; CHECK-NEXT: fmov s7, w8
+; CHECK-NEXT: csetm w8, lt
+; CHECK-NEXT: mov v2.h[2], w8
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fcmp s7, s3
+; CHECK-NEXT: mov h3, v1.h[6]
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: mov h1, v1.h[7]
+; CHECK-NEXT: fmov s6, w9
+; CHECK-NEXT: fmov w9, s5
+; CHECK-NEXT: csetm w8, lt
+; CHECK-NEXT: mov v2.h[3], w8
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: fcmp s6, s4
+; CHECK-NEXT: mov h4, v0.h[6]
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s5, w8
+; CHECK-NEXT: mov h0, v0.h[7]
+; CHECK-NEXT: fmov s6, w9
+; CHECK-NEXT: csetm w8, lt
+; CHECK-NEXT: mov v2.h[4], w8
+; CHECK-NEXT: fmov w8, s3
+; CHECK-NEXT: fmov w9, s4
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: csetm w10, lt
+; CHECK-NEXT: fmov s3, w8
+; CHECK-NEXT: fmov s4, w9
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: mov v2.h[5], w10
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fcmp s4, s3
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: csetm w8, lt
+; CHECK-NEXT: mov v2.h[6], w8
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: csetm w8, lt
+; CHECK-NEXT: mov v2.h[7], w8
+; CHECK-NEXT: xtn v0.8b, v2.8h
+; CHECK-NEXT: ret
+ %1 = fcmp ult <8 x bfloat> %a, %b
+ ret <8 x i1> %1
+}
+
+define <8 x i1> @test_fcmp_ule(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_ule:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov h2, v1.h[1]
+; CHECK-NEXT: mov h3, v0.h[1]
+; CHECK-NEXT: fmov w10, s1
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: mov h2, v1.h[2]
+; CHECK-NEXT: mov h3, v0.h[2]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: fmov w10, s3
+; CHECK-NEXT: mov h3, v1.h[4]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: mov h4, v1.h[3]
+; CHECK-NEXT: lsl w10, w10, #16
+; CHECK-NEXT: fmov s6, w8
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: csetm w9, le
+; CHECK-NEXT: fmov s16, w10
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: mov h5, v0.h[3]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: mov h6, v0.h[4]
+; CHECK-NEXT: mov h4, v1.h[5]
+; CHECK-NEXT: fmov s7, w8
+; CHECK-NEXT: csetm w8, le
+; CHECK-NEXT: fmov s2, w8
+; CHECK-NEXT: fmov w8, s5
+; CHECK-NEXT: mov h5, v0.h[5]
+; CHECK-NEXT: fcmp s16, s7
+; CHECK-NEXT: mov v2.h[1], w9
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov w10, s3
+; CHECK-NEXT: fmov s3, w9
+; CHECK-NEXT: fmov w9, s6
+; CHECK-NEXT: fmov s7, w8
+; CHECK-NEXT: csetm w8, le
+; CHECK-NEXT: mov v2.h[2], w8
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fcmp s7, s3
+; CHECK-NEXT: mov h3, v1.h[6]
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: mov h1, v1.h[7]
+; CHECK-NEXT: fmov s6, w9
+; CHECK-NEXT: fmov w9, s5
+; CHECK-NEXT: csetm w8, le
+; CHECK-NEXT: mov v2.h[3], w8
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: fcmp s6, s4
+; CHECK-NEXT: mov h4, v0.h[6]
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s5, w8
+; CHECK-NEXT: mov h0, v0.h[7]
+; CHECK-NEXT: fmov s6, w9
+; CHECK-NEXT: csetm w8, le
+; CHECK-NEXT: mov v2.h[4], w8
+; CHECK-NEXT: fmov w8, s3
+; CHECK-NEXT: fmov w9, s4
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: csetm w10, le
+; CHECK-NEXT: fmov s3, w8
+; CHECK-NEXT: fmov s4, w9
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: mov v2.h[5], w10
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fcmp s4, s3
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: csetm w8, le
+; CHECK-NEXT: mov v2.h[6], w8
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: csetm w8, le
+; CHECK-NEXT: mov v2.h[7], w8
+; CHECK-NEXT: xtn v0.8b, v2.8h
+; CHECK-NEXT: ret
+ %1 = fcmp ule <8 x bfloat> %a, %b
+ ret <8 x i1> %1
+}
+
+define <8 x i1> @test_fcmp_uno(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_uno:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov h2, v1.h[1]
+; CHECK-NEXT: mov h3, v0.h[1]
+; CHECK-NEXT: fmov w10, s1
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: mov h2, v1.h[2]
+; CHECK-NEXT: mov h3, v0.h[2]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: fmov w10, s3
+; CHECK-NEXT: mov h3, v1.h[4]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: mov h4, v1.h[3]
+; CHECK-NEXT: lsl w10, w10, #16
+; CHECK-NEXT: fmov s6, w8
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: csetm w9, vs
+; CHECK-NEXT: fmov s16, w10
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: mov h5, v0.h[3]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: mov h6, v0.h[4]
+; CHECK-NEXT: mov h4, v1.h[5]
+; CHECK-NEXT: fmov s7, w8
+; CHECK-NEXT: csetm w8, vs
+; CHECK-NEXT: fmov s2, w8
+; CHECK-NEXT: fmov w8, s5
+; CHECK-NEXT: mov h5, v0.h[5]
+; CHECK-NEXT: fcmp s16, s7
+; CHECK-NEXT: mov v2.h[1], w9
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov w10, s3
+; CHECK-NEXT: fmov s3, w9
+; CHECK-NEXT: fmov w9, s6
+; CHECK-NEXT: fmov s7, w8
+; CHECK-NEXT: csetm w8, vs
+; CHECK-NEXT: mov v2.h[2], w8
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fcmp s7, s3
+; CHECK-NEXT: mov h3, v1.h[6]
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: mov h1, v1.h[7]
+; CHECK-NEXT: fmov s6, w9
+; CHECK-NEXT: fmov w9, s5
+; CHECK-NEXT: csetm w8, vs
+; CHECK-NEXT: mov v2.h[3], w8
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: fcmp s6, s4
+; CHECK-NEXT: mov h4, v0.h[6]
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s5, w8
+; CHECK-NEXT: mov h0, v0.h[7]
+; CHECK-NEXT: fmov s6, w9
+; CHECK-NEXT: csetm w8, vs
+; CHECK-NEXT: mov v2.h[4], w8
+; CHECK-NEXT: fmov w8, s3
+; CHECK-NEXT: fmov w9, s4
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: csetm w10, vs
+; CHECK-NEXT: fmov s3, w8
+; CHECK-NEXT: fmov s4, w9
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: mov v2.h[5], w10
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fcmp s4, s3
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: csetm w8, vs
+; CHECK-NEXT: mov v2.h[6], w8
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: csetm w8, vs
+; CHECK-NEXT: mov v2.h[7], w8
+; CHECK-NEXT: xtn v0.8b, v2.8h
+; CHECK-NEXT: ret
+ %1 = fcmp uno <8 x bfloat> %a, %b
+ ret <8 x i1> %1
+}
+
+define <8 x i1> @test_fcmp_one(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_one:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov h2, v1.h[1]
+; CHECK-NEXT: mov h3, v0.h[1]
+; CHECK-NEXT: fmov w10, s1
+; CHECK-NEXT: fmov w11, s0
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: mov h2, v1.h[2]
+; CHECK-NEXT: mov h3, v0.h[2]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: lsl w9, w11, #16
+; CHECK-NEXT: fmov s6, w8
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: fmov s7, w9
+; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: mov h4, v1.h[3]
+; CHECK-NEXT: mov h5, v0.h[3]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: csetm w10, mi
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: csinv w10, w10, wzr, le
+; CHECK-NEXT: fcmp s7, s6
+; CHECK-NEXT: fmov s2, w8
+; CHECK-NEXT: fmov s3, w9
+; CHECK-NEXT: fmov w11, s4
+; CHECK-NEXT: fmov w8, s5
+; CHECK-NEXT: mov h4, v0.h[4]
+; CHECK-NEXT: mov h7, v1.h[5]
+; CHECK-NEXT: csetm w9, mi
+; CHECK-NEXT: csinv w9, w9, wzr, le
+; CHECK-NEXT: fcmp s3, s2
+; CHECK-NEXT: mov h3, v1.h[4]
+; CHECK-NEXT: fmov s2, w9
+; CHECK-NEXT: lsl w11, w11, #16
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov s5, w11
+; CHECK-NEXT: fmov s6, w8
+; CHECK-NEXT: csetm w8, mi
+; CHECK-NEXT: mov v2.h[1], w10
+; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: csinv w8, w8, wzr, le
+; CHECK-NEXT: mov h3, v1.h[6]
+; CHECK-NEXT: mov h1, v1.h[7]
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: mov h5, v0.h[5]
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: lsl w10, w10, #16
+; CHECK-NEXT: mov v2.h[2], w8
+; CHECK-NEXT: fmov s4, w9
+; CHECK-NEXT: fmov s6, w10
+; CHECK-NEXT: csetm w8, mi
+; CHECK-NEXT: fmov w9, s7
+; CHECK-NEXT: fmov w10, s5
+; CHECK-NEXT: csinv w8, w8, wzr, le
+; CHECK-NEXT: fcmp s6, s4
+; CHECK-NEXT: mov h4, v0.h[6]
+; CHECK-NEXT: mov v2.h[3], w8
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: lsl w10, w10, #16
+; CHECK-NEXT: mov h0, v0.h[7]
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: fmov s6, w10
+; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: csetm w8, mi
+; CHECK-NEXT: csinv w8, w8, wzr, le
+; CHECK-NEXT: mov v2.h[4], w8
+; CHECK-NEXT: lsl w8, w9, #16
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: fmov s3, w8
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov s4, w9
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: csetm w10, mi
+; CHECK-NEXT: csinv w10, w10, wzr, le
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: mov v2.h[5], w10
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fcmp s4, s3
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: csetm w8, mi
+; CHECK-NEXT: csinv w8, w8, wzr, le
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: mov v2.h[6], w8
+; CHECK-NEXT: csetm w8, mi
+; CHECK-NEXT: csinv w8, w8, wzr, le
+; CHECK-NEXT: mov v2.h[7], w8
+; CHECK-NEXT: xtn v0.8b, v2.8h
+; CHECK-NEXT: ret
+ %1 = fcmp one <8 x bfloat> %a, %b
+ ret <8 x i1> %1
+}
+
+define <8 x i1> @test_fcmp_oeq(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_oeq:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov h2, v1.h[1]
+; CHECK-NEXT: mov h3, v0.h[1]
+; CHECK-NEXT: fmov w10, s1
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: mov h2, v1.h[2]
+; CHECK-NEXT: mov h3, v0.h[2]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: fmov w10, s3
+; CHECK-NEXT: mov h3, v1.h[4]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: mov h4, v1.h[3]
+; CHECK-NEXT: lsl w10, w10, #16
+; CHECK-NEXT: fmov s6, w8
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: csetm w9, eq
+; CHECK-NEXT: fmov s16, w10
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: mov h5, v0.h[3]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: mov h6, v0.h[4]
+; CHECK-NEXT: mov h4, v1.h[5]
+; CHECK-NEXT: fmov s7, w8
+; CHECK-NEXT: csetm w8, eq
+; CHECK-NEXT: fmov s2, w8
+; CHECK-NEXT: fmov w8, s5
+; CHECK-NEXT: mov h5, v0.h[5]
+; CHECK-NEXT: fcmp s16, s7
+; CHECK-NEXT: mov v2.h[1], w9
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov w10, s3
+; CHECK-NEXT: fmov s3, w9
+; CHECK-NEXT: fmov w9, s6
+; CHECK-NEXT: fmov s7, w8
+; CHECK-NEXT: csetm w8, eq
+; CHECK-NEXT: mov v2.h[2], w8
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fcmp s7, s3
+; CHECK-NEXT: mov h3, v1.h[6]
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: mov h1, v1.h[7]
+; CHECK-NEXT: fmov s6, w9
+; CHECK-NEXT: fmov w9, s5
+; CHECK-NEXT: csetm w8, eq
+; CHECK-NEXT: mov v2.h[3], w8
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: fcmp s6, s4
+; CHECK-NEXT: mov h4, v0.h[6]
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s5, w8
+; CHECK-NEXT: mov h0, v0.h[7]
+; CHECK-NEXT: fmov s6, w9
+; CHECK-NEXT: csetm w8, eq
+; CHECK-NEXT: mov v2.h[4], w8
+; CHECK-NEXT: fmov w8, s3
+; CHECK-NEXT: fmov w9, s4
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: csetm w10, eq
+; CHECK-NEXT: fmov s3, w8
+; CHECK-NEXT: fmov s4, w9
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: mov v2.h[5], w10
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fcmp s4, s3
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: csetm w8, eq
+; CHECK-NEXT: mov v2.h[6], w8
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: csetm w8, eq
+; CHECK-NEXT: mov v2.h[7], w8
+; CHECK-NEXT: xtn v0.8b, v2.8h
+; CHECK-NEXT: ret
+ %1 = fcmp oeq <8 x bfloat> %a, %b
+ ret <8 x i1> %1
+}
+
+define <8 x i1> @test_fcmp_ogt(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_ogt:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov h2, v1.h[1]
+; CHECK-NEXT: mov h3, v0.h[1]
+; CHECK-NEXT: fmov w10, s1
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: mov h2, v1.h[2]
+; CHECK-NEXT: mov h3, v0.h[2]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: fmov w10, s3
+; CHECK-NEXT: mov h3, v1.h[4]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: mov h4, v1.h[3]
+; CHECK-NEXT: lsl w10, w10, #16
+; CHECK-NEXT: fmov s6, w8
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: csetm w9, gt
+; CHECK-NEXT: fmov s16, w10
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: mov h5, v0.h[3]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: mov h6, v0.h[4]
+; CHECK-NEXT: mov h4, v1.h[5]
+; CHECK-NEXT: fmov s7, w8
+; CHECK-NEXT: csetm w8, gt
+; CHECK-NEXT: fmov s2, w8
+; CHECK-NEXT: fmov w8, s5
+; CHECK-NEXT: mov h5, v0.h[5]
+; CHECK-NEXT: fcmp s16, s7
+; CHECK-NEXT: mov v2.h[1], w9
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov w10, s3
+; CHECK-NEXT: fmov s3, w9
+; CHECK-NEXT: fmov w9, s6
+; CHECK-NEXT: fmov s7, w8
+; CHECK-NEXT: csetm w8, gt
+; CHECK-NEXT: mov v2.h[2], w8
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fcmp s7, s3
+; CHECK-NEXT: mov h3, v1.h[6]
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: mov h1, v1.h[7]
+; CHECK-NEXT: fmov s6, w9
+; CHECK-NEXT: fmov w9, s5
+; CHECK-NEXT: csetm w8, gt
+; CHECK-NEXT: mov v2.h[3], w8
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: fcmp s6, s4
+; CHECK-NEXT: mov h4, v0.h[6]
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s5, w8
+; CHECK-NEXT: mov h0, v0.h[7]
+; CHECK-NEXT: fmov s6, w9
+; CHECK-NEXT: csetm w8, gt
+; CHECK-NEXT: mov v2.h[4], w8
+; CHECK-NEXT: fmov w8, s3
+; CHECK-NEXT: fmov w9, s4
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: csetm w10, gt
+; CHECK-NEXT: fmov s3, w8
+; CHECK-NEXT: fmov s4, w9
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: mov v2.h[5], w10
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fcmp s4, s3
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: csetm w8, gt
+; CHECK-NEXT: mov v2.h[6], w8
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: csetm w8, gt
+; CHECK-NEXT: mov v2.h[7], w8
+; CHECK-NEXT: xtn v0.8b, v2.8h
+; CHECK-NEXT: ret
+ %1 = fcmp ogt <8 x bfloat> %a, %b
+ ret <8 x i1> %1
+}
+
+define <8 x i1> @test_fcmp_oge(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_oge:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov h2, v1.h[1]
+; CHECK-NEXT: mov h3, v0.h[1]
+; CHECK-NEXT: fmov w10, s1
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: mov h2, v1.h[2]
+; CHECK-NEXT: mov h3, v0.h[2]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: fmov w10, s3
+; CHECK-NEXT: mov h3, v1.h[4]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: mov h4, v1.h[3]
+; CHECK-NEXT: lsl w10, w10, #16
+; CHECK-NEXT: fmov s6, w8
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: csetm w9, ge
+; CHECK-NEXT: fmov s16, w10
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: mov h5, v0.h[3]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: mov h6, v0.h[4]
+; CHECK-NEXT: mov h4, v1.h[5]
+; CHECK-NEXT: fmov s7, w8
+; CHECK-NEXT: csetm w8, ge
+; CHECK-NEXT: fmov s2, w8
+; CHECK-NEXT: fmov w8, s5
+; CHECK-NEXT: mov h5, v0.h[5]
+; CHECK-NEXT: fcmp s16, s7
+; CHECK-NEXT: mov v2.h[1], w9
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov w10, s3
+; CHECK-NEXT: fmov s3, w9
+; CHECK-NEXT: fmov w9, s6
+; CHECK-NEXT: fmov s7, w8
+; CHECK-NEXT: csetm w8, ge
+; CHECK-NEXT: mov v2.h[2], w8
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fcmp s7, s3
+; CHECK-NEXT: mov h3, v1.h[6]
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: mov h1, v1.h[7]
+; CHECK-NEXT: fmov s6, w9
+; CHECK-NEXT: fmov w9, s5
+; CHECK-NEXT: csetm w8, ge
+; CHECK-NEXT: mov v2.h[3], w8
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: fcmp s6, s4
+; CHECK-NEXT: mov h4, v0.h[6]
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s5, w8
+; CHECK-NEXT: mov h0, v0.h[7]
+; CHECK-NEXT: fmov s6, w9
+; CHECK-NEXT: csetm w8, ge
+; CHECK-NEXT: mov v2.h[4], w8
+; CHECK-NEXT: fmov w8, s3
+; CHECK-NEXT: fmov w9, s4
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: csetm w10, ge
+; CHECK-NEXT: fmov s3, w8
+; CHECK-NEXT: fmov s4, w9
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: mov v2.h[5], w10
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fcmp s4, s3
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: csetm w8, ge
+; CHECK-NEXT: mov v2.h[6], w8
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: csetm w8, ge
+; CHECK-NEXT: mov v2.h[7], w8
+; CHECK-NEXT: xtn v0.8b, v2.8h
+; CHECK-NEXT: ret
+ %1 = fcmp oge <8 x bfloat> %a, %b
+ ret <8 x i1> %1
+}
+
+define <8 x i1> @test_fcmp_olt(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_olt:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov h2, v1.h[1]
+; CHECK-NEXT: mov h3, v0.h[1]
+; CHECK-NEXT: fmov w10, s1
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: mov h2, v1.h[2]
+; CHECK-NEXT: mov h3, v0.h[2]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: fmov w10, s3
+; CHECK-NEXT: mov h3, v1.h[4]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: mov h4, v1.h[3]
+; CHECK-NEXT: lsl w10, w10, #16
+; CHECK-NEXT: fmov s6, w8
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: csetm w9, mi
+; CHECK-NEXT: fmov s16, w10
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: mov h5, v0.h[3]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: mov h6, v0.h[4]
+; CHECK-NEXT: mov h4, v1.h[5]
+; CHECK-NEXT: fmov s7, w8
+; CHECK-NEXT: csetm w8, mi
+; CHECK-NEXT: fmov s2, w8
+; CHECK-NEXT: fmov w8, s5
+; CHECK-NEXT: mov h5, v0.h[5]
+; CHECK-NEXT: fcmp s16, s7
+; CHECK-NEXT: mov v2.h[1], w9
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov w10, s3
+; CHECK-NEXT: fmov s3, w9
+; CHECK-NEXT: fmov w9, s6
+; CHECK-NEXT: fmov s7, w8
+; CHECK-NEXT: csetm w8, mi
+; CHECK-NEXT: mov v2.h[2], w8
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fcmp s7, s3
+; CHECK-NEXT: mov h3, v1.h[6]
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: mov h1, v1.h[7]
+; CHECK-NEXT: fmov s6, w9
+; CHECK-NEXT: fmov w9, s5
+; CHECK-NEXT: csetm w8, mi
+; CHECK-NEXT: mov v2.h[3], w8
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: fcmp s6, s4
+; CHECK-NEXT: mov h4, v0.h[6]
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s5, w8
+; CHECK-NEXT: mov h0, v0.h[7]
+; CHECK-NEXT: fmov s6, w9
+; CHECK-NEXT: csetm w8, mi
+; CHECK-NEXT: mov v2.h[4], w8
+; CHECK-NEXT: fmov w8, s3
+; CHECK-NEXT: fmov w9, s4
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: csetm w10, mi
+; CHECK-NEXT: fmov s3, w8
+; CHECK-NEXT: fmov s4, w9
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: mov v2.h[5], w10
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fcmp s4, s3
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: csetm w8, mi
+; CHECK-NEXT: mov v2.h[6], w8
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: csetm w8, mi
+; CHECK-NEXT: mov v2.h[7], w8
+; CHECK-NEXT: xtn v0.8b, v2.8h
+; CHECK-NEXT: ret
+ %1 = fcmp olt <8 x bfloat> %a, %b
+ ret <8 x i1> %1
+}
+
+define <8 x i1> @test_fcmp_ole(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_ole:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov h2, v1.h[1]
+; CHECK-NEXT: mov h3, v0.h[1]
+; CHECK-NEXT: fmov w10, s1
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: mov h2, v1.h[2]
+; CHECK-NEXT: mov h3, v0.h[2]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: fmov w10, s3
+; CHECK-NEXT: mov h3, v1.h[4]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: mov h4, v1.h[3]
+; CHECK-NEXT: lsl w10, w10, #16
+; CHECK-NEXT: fmov s6, w8
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: csetm w9, ls
+; CHECK-NEXT: fmov s16, w10
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: mov h5, v0.h[3]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: mov h6, v0.h[4]
+; CHECK-NEXT: mov h4, v1.h[5]
+; CHECK-NEXT: fmov s7, w8
+; CHECK-NEXT: csetm w8, ls
+; CHECK-NEXT: fmov s2, w8
+; CHECK-NEXT: fmov w8, s5
+; CHECK-NEXT: mov h5, v0.h[5]
+; CHECK-NEXT: fcmp s16, s7
+; CHECK-NEXT: mov v2.h[1], w9
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov w10, s3
+; CHECK-NEXT: fmov s3, w9
+; CHECK-NEXT: fmov w9, s6
+; CHECK-NEXT: fmov s7, w8
+; CHECK-NEXT: csetm w8, ls
+; CHECK-NEXT: mov v2.h[2], w8
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fcmp s7, s3
+; CHECK-NEXT: mov h3, v1.h[6]
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: mov h1, v1.h[7]
+; CHECK-NEXT: fmov s6, w9
+; CHECK-NEXT: fmov w9, s5
+; CHECK-NEXT: csetm w8, ls
+; CHECK-NEXT: mov v2.h[3], w8
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: fcmp s6, s4
+; CHECK-NEXT: mov h4, v0.h[6]
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s5, w8
+; CHECK-NEXT: mov h0, v0.h[7]
+; CHECK-NEXT: fmov s6, w9
+; CHECK-NEXT: csetm w8, ls
+; CHECK-NEXT: mov v2.h[4], w8
+; CHECK-NEXT: fmov w8, s3
+; CHECK-NEXT: fmov w9, s4
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: csetm w10, ls
+; CHECK-NEXT: fmov s3, w8
+; CHECK-NEXT: fmov s4, w9
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: mov v2.h[5], w10
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fcmp s4, s3
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: csetm w8, ls
+; CHECK-NEXT: mov v2.h[6], w8
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: csetm w8, ls
+; CHECK-NEXT: mov v2.h[7], w8
+; CHECK-NEXT: xtn v0.8b, v2.8h
+; CHECK-NEXT: ret
+ %1 = fcmp ole <8 x bfloat> %a, %b
+ ret <8 x i1> %1
+}
+
+define <8 x i1> @test_fcmp_ord(<8 x bfloat> %a, <8 x bfloat> %b) #0 {
+; CHECK-LABEL: test_fcmp_ord:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov h2, v1.h[1]
+; CHECK-NEXT: mov h3, v0.h[1]
+; CHECK-NEXT: fmov w10, s1
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: fmov w9, s3
+; CHECK-NEXT: mov h2, v1.h[2]
+; CHECK-NEXT: mov h3, v0.h[2]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: fmov w10, s3
+; CHECK-NEXT: mov h3, v1.h[4]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fcmp s5, s4
+; CHECK-NEXT: fmov s5, w9
+; CHECK-NEXT: mov h4, v1.h[3]
+; CHECK-NEXT: lsl w10, w10, #16
+; CHECK-NEXT: fmov s6, w8
+; CHECK-NEXT: fmov w8, s2
+; CHECK-NEXT: csetm w9, vc
+; CHECK-NEXT: fmov s16, w10
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: mov h5, v0.h[3]
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: mov h6, v0.h[4]
+; CHECK-NEXT: mov h4, v1.h[5]
+; CHECK-NEXT: fmov s7, w8
+; CHECK-NEXT: csetm w8, vc
+; CHECK-NEXT: fmov s2, w8
+; CHECK-NEXT: fmov w8, s5
+; CHECK-NEXT: mov h5, v0.h[5]
+; CHECK-NEXT: fcmp s16, s7
+; CHECK-NEXT: mov v2.h[1], w9
+; CHECK-NEXT: lsl w9, w10, #16
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fmov w10, s3
+; CHECK-NEXT: fmov s3, w9
+; CHECK-NEXT: fmov w9, s6
+; CHECK-NEXT: fmov s7, w8
+; CHECK-NEXT: csetm w8, vc
+; CHECK-NEXT: mov v2.h[2], w8
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: fmov w10, s4
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fcmp s7, s3
+; CHECK-NEXT: mov h3, v1.h[6]
+; CHECK-NEXT: fmov s4, w8
+; CHECK-NEXT: mov h1, v1.h[7]
+; CHECK-NEXT: fmov s6, w9
+; CHECK-NEXT: fmov w9, s5
+; CHECK-NEXT: csetm w8, vc
+; CHECK-NEXT: mov v2.h[3], w8
+; CHECK-NEXT: lsl w8, w10, #16
+; CHECK-NEXT: fcmp s6, s4
+; CHECK-NEXT: mov h4, v0.h[6]
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s5, w8
+; CHECK-NEXT: mov h0, v0.h[7]
+; CHECK-NEXT: fmov s6, w9
+; CHECK-NEXT: csetm w8, vc
+; CHECK-NEXT: mov v2.h[4], w8
+; CHECK-NEXT: fmov w8, s3
+; CHECK-NEXT: fmov w9, s4
+; CHECK-NEXT: fcmp s6, s5
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: csetm w10, vc
+; CHECK-NEXT: fmov s3, w8
+; CHECK-NEXT: fmov s4, w9
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: mov v2.h[5], w10
+; CHECK-NEXT: lsl w8, w8, #16
+; CHECK-NEXT: fcmp s4, s3
+; CHECK-NEXT: lsl w9, w9, #16
+; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: fmov s1, w9
+; CHECK-NEXT: csetm w8, vc
+; CHECK-NEXT: mov v2.h[6], w8
+; CHECK-NEXT: fcmp s1, s0
+; CHECK-NEXT: csetm w8, vc
+; CHECK-NEXT: mov v2.h[7], w8
+; CHECK-NEXT: xtn v0.8b, v2.8h
+; CHECK-NEXT: ret
+ %1 = fcmp ord <8 x bfloat> %a, %b
+ ret <8 x i1> %1
+}
+
+attributes #0 = { nounwind }
>From 99dc3967595c472b6edbe789a1346b0350294567 Mon Sep 17 00:00:00 2001
From: John Brawn <john.brawn at arm.com>
Date: Fri, 6 Dec 2024 11:40:30 +0000
Subject: [PATCH 11/18] [InstCombine] Make fptrunc combine use intersection of
fast math flags (#118808)
These combines involve swapping the fptrunc with its operand, and using
the intersection of fast math flags is the safest option. For example, if
we have (fptrunc (fneg ninf x)), then (fneg ninf (fptrunc x)) will not be
correct: if x is not within the range of the destination type, the result
of (fptrunc x) will be inf, so the ninf must not be carried over from the
fneg.
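Spelled out in IR, the problematic input is roughly (value names are
illustrative, not taken from the tests added below):

  %n = fneg ninf float %x
  %t = fptrunc float %n to half

Propagating the operand's flags to the swapped form would put ninf on the
new fneg even though the preceding fptrunc can produce an infinity; taking
the intersection (empty here, since the fptrunc has no flags) leaves the
rewritten fptrunc/fneg pair without fast math flags.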
---
.../InstCombine/InstCombineCasts.cpp | 11 +--
llvm/test/Transforms/InstCombine/fpcast.ll | 88 +++++++++++++++++++
llvm/test/Transforms/InstCombine/fptrunc.ll | 24 +++++
3 files changed, 118 insertions(+), 5 deletions(-)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index d21714b10155d9..102b784169ca7d 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -1847,15 +1847,16 @@ Instruction *InstCombinerImpl::visitFPTrunc(FPTruncInst &FPT) {
Value *X;
Instruction *Op = dyn_cast<Instruction>(FPT.getOperand(0));
if (Op && Op->hasOneUse()) {
- // FIXME: The FMF should propagate from the fptrunc, not the source op.
IRBuilder<>::FastMathFlagGuard FMFG(Builder);
- if (isa<FPMathOperator>(Op))
- Builder.setFastMathFlags(Op->getFastMathFlags());
+ FastMathFlags FMF = FPT.getFastMathFlags();
+ if (auto *FPMO = dyn_cast<FPMathOperator>(Op))
+ FMF &= FPMO->getFastMathFlags();
+ Builder.setFastMathFlags(FMF);
if (match(Op, m_FNeg(m_Value(X)))) {
Value *InnerTrunc = Builder.CreateFPTrunc(X, Ty);
-
- return UnaryOperator::CreateFNegFMF(InnerTrunc, Op);
+ Value *Neg = Builder.CreateFNeg(InnerTrunc);
+ return replaceInstUsesWith(FPT, Neg);
}
// If we are truncating a select that has an extended operand, we can
diff --git a/llvm/test/Transforms/InstCombine/fpcast.ll b/llvm/test/Transforms/InstCombine/fpcast.ll
index 029e513ceafbcd..d5290b572aefd9 100644
--- a/llvm/test/Transforms/InstCombine/fpcast.ll
+++ b/llvm/test/Transforms/InstCombine/fpcast.ll
@@ -29,6 +29,17 @@ define half @test3(float %a) {
ret half %c
}
+define half @test3_fast(float %a) {
+; CHECK-LABEL: @test3_fast(
+; CHECK-NEXT: [[TMP1:%.*]] = fptrunc float [[A:%.*]] to half
+; CHECK-NEXT: [[C:%.*]] = call half @llvm.fabs.f16(half [[TMP1]])
+; CHECK-NEXT: ret half [[C]]
+;
+ %b = call float @llvm.fabs.f32(float %a)
+ %c = fptrunc fast float %b to half
+ ret half %c
+}
+
define half @fneg_fptrunc(float %a) {
; CHECK-LABEL: @fneg_fptrunc(
; CHECK-NEXT: [[TMP1:%.*]] = fptrunc float [[A:%.*]] to half
@@ -78,6 +89,28 @@ define half @test4-fast(float %a) {
; CHECK-NEXT: [[TMP1:%.*]] = fptrunc fast float [[A:%.*]] to half
; CHECK-NEXT: [[C:%.*]] = fneg fast half [[TMP1]]
; CHECK-NEXT: ret half [[C]]
+;
+ %b = fsub fast float -0.0, %a
+ %c = fptrunc fast float %b to half
+ ret half %c
+}
+
+define half @test4-mixed-fast-1(float %a) {
+; CHECK-LABEL: @test4-mixed-fast-1(
+; CHECK-NEXT: [[TMP1:%.*]] = fptrunc float [[A:%.*]] to half
+; CHECK-NEXT: [[C:%.*]] = fneg half [[TMP1]]
+; CHECK-NEXT: ret half [[C]]
+;
+ %b = fsub float -0.0, %a
+ %c = fptrunc fast float %b to half
+ ret half %c
+}
+
+define half @test4-mixed-fast-2(float %a) {
+; CHECK-LABEL: @test4-mixed-fast-2(
+; CHECK-NEXT: [[TMP1:%.*]] = fptrunc float [[A:%.*]] to half
+; CHECK-NEXT: [[C:%.*]] = fneg half [[TMP1]]
+; CHECK-NEXT: ret half [[C]]
;
%b = fsub fast float -0.0, %a
%c = fptrunc float %b to half
@@ -89,12 +122,67 @@ define half @test4_unary_fneg-fast(float %a) {
; CHECK-NEXT: [[TMP1:%.*]] = fptrunc fast float [[A:%.*]] to half
; CHECK-NEXT: [[C:%.*]] = fneg fast half [[TMP1]]
; CHECK-NEXT: ret half [[C]]
+;
+ %b = fneg fast float %a
+ %c = fptrunc fast float %b to half
+ ret half %c
+}
+
+define half @test4_unary_fneg-mixed-fast-1(float %a) {
+; CHECK-LABEL: @test4_unary_fneg-mixed-fast-1(
+; CHECK-NEXT: [[TMP1:%.*]] = fptrunc float [[A:%.*]] to half
+; CHECK-NEXT: [[C:%.*]] = fneg half [[TMP1]]
+; CHECK-NEXT: ret half [[C]]
+;
+ %b = fneg float %a
+ %c = fptrunc fast float %b to half
+ ret half %c
+}
+
+define half @test4_unary_fneg-mixed-fast-2(float %a) {
+; CHECK-LABEL: @test4_unary_fneg-mixed-fast-2(
+; CHECK-NEXT: [[TMP1:%.*]] = fptrunc float [[A:%.*]] to half
+; CHECK-NEXT: [[C:%.*]] = fneg half [[TMP1]]
+; CHECK-NEXT: ret half [[C]]
;
%b = fneg fast float %a
%c = fptrunc float %b to half
ret half %c
}
+define <2 x half> @test4_unary_fneg-vec-fast(<2 x float> %a) {
+; CHECK-LABEL: @test4_unary_fneg-vec-fast(
+; CHECK-NEXT: [[TMP1:%.*]] = fptrunc fast <2 x float> [[A:%.*]] to <2 x half>
+; CHECK-NEXT: [[C:%.*]] = fneg fast <2 x half> [[TMP1]]
+; CHECK-NEXT: ret <2 x half> [[C]]
+;
+ %b = fneg fast <2 x float> %a
+ %c = fptrunc fast <2 x float> %b to <2 x half>
+ ret <2 x half> %c
+}
+
+define <2 x half> @test4_unary_fneg-vec-mixed-fast-1(<2 x float> %a) {
+; CHECK-LABEL: @test4_unary_fneg-vec-mixed-fast-1(
+; CHECK-NEXT: [[TMP1:%.*]] = fptrunc <2 x float> [[A:%.*]] to <2 x half>
+; CHECK-NEXT: [[C:%.*]] = fneg <2 x half> [[TMP1]]
+; CHECK-NEXT: ret <2 x half> [[C]]
+;
+ %b = fneg <2 x float> %a
+ %c = fptrunc fast <2 x float> %b to <2 x half>
+ ret <2 x half> %c
+}
+
+define <2 x half> @test4_unary_fneg-vec-mixed-fast-2(<2 x float> %a) {
+; CHECK-LABEL: @test4_unary_fneg-vec-mixed-fast-2(
+; CHECK-NEXT: [[TMP1:%.*]] = fptrunc <2 x float> [[A:%.*]] to <2 x half>
+; CHECK-NEXT: [[C:%.*]] = fneg <2 x half> [[TMP1]]
+; CHECK-NEXT: ret <2 x half> [[C]]
+;
+ %b = fneg fast <2 x float> %a
+ %c = fptrunc <2 x float> %b to <2 x half>
+ ret <2 x half> %c
+}
+
define half @test5(float %a, float %b, float %c) {
; CHECK-LABEL: @test5(
; CHECK-NEXT: [[D:%.*]] = fcmp ogt float [[A:%.*]], [[B:%.*]]
diff --git a/llvm/test/Transforms/InstCombine/fptrunc.ll b/llvm/test/Transforms/InstCombine/fptrunc.ll
index a4296a326c4bc6..0b5d8b3cd06e07 100644
--- a/llvm/test/Transforms/InstCombine/fptrunc.ll
+++ b/llvm/test/Transforms/InstCombine/fptrunc.ll
@@ -61,6 +61,18 @@ define float @fptrunc_select_true_val(float %x, double %y, i1 %cond) {
ret float %r
}
+define float @fptrunc_fast_select_true_val(float %x, double %y, i1 %cond) {
+; CHECK-LABEL: @fptrunc_fast_select_true_val(
+; CHECK-NEXT: [[TMP1:%.*]] = fptrunc fast double [[Y:%.*]] to float
+; CHECK-NEXT: [[NARROW_SEL:%.*]] = select i1 [[COND:%.*]], float [[TMP1]], float [[X:%.*]]
+; CHECK-NEXT: ret float [[NARROW_SEL]]
+;
+ %e = fpext float %x to double
+ %sel = select fast i1 %cond, double %y, double %e
+ %r = fptrunc fast double %sel to float
+ ret float %r
+}
+
define <2 x float> @fptrunc_select_false_val(<2 x float> %x, <2 x double> %y, <2 x i1> %cond) {
; CHECK-LABEL: @fptrunc_select_false_val(
; CHECK-NEXT: [[TMP1:%.*]] = fptrunc <2 x double> [[Y:%.*]] to <2 x float>
@@ -73,6 +85,18 @@ define <2 x float> @fptrunc_select_false_val(<2 x float> %x, <2 x double> %y, <2
ret <2 x float> %r
}
+define <2 x float> @fptrunc_nnan_select_false_val(<2 x float> %x, <2 x double> %y, <2 x i1> %cond) {
+; CHECK-LABEL: @fptrunc_nnan_select_false_val(
+; CHECK-NEXT: [[TMP1:%.*]] = fptrunc nnan <2 x double> [[Y:%.*]] to <2 x float>
+; CHECK-NEXT: [[NARROW_SEL:%.*]] = select <2 x i1> [[COND:%.*]], <2 x float> [[X:%.*]], <2 x float> [[TMP1]]
+; CHECK-NEXT: ret <2 x float> [[NARROW_SEL]]
+;
+ %e = fpext <2 x float> %x to <2 x double>
+ %sel = select nnan <2 x i1> %cond, <2 x double> %e, <2 x double> %y
+ %r = fptrunc nnan <2 x double> %sel to <2 x float>
+ ret <2 x float> %r
+}
+
declare void @use(float)
define half @fptrunc_select_true_val_extra_use(half %x, float %y, i1 %cond) {
>From 0015ffbcbd79b26231237137d5d10c86abdd19cb Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 22 Dec 2023 19:18:24 +0000
Subject: [PATCH 12/18] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20?=
=?UTF-8?q?initial=20version?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Created using spr 1.3.4
---
llvm/include/llvm/Bitcode/LLVMBitCodes.h | 1 +
llvm/include/llvm/IR/Attributes.td | 4 +
.../Instrumentation/TypeSanitizer.h | 38 +
llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp | 28 +-
llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 2 +
llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 2 +
llvm/lib/CodeGen/ShrinkWrap.cpp | 1 +
llvm/lib/Passes/PassBuilder.cpp | 1 +
llvm/lib/Passes/PassRegistry.def | 2 +
.../Transforms/Instrumentation/CMakeLists.txt | 1 +
.../Instrumentation/TypeSanitizer.cpp | 873 ++++++++++++++++++
llvm/lib/Transforms/Utils/CodeExtractor.cpp | 1 +
.../TypeSanitizer/access-with-offfset.ll | 71 ++
.../Instrumentation/TypeSanitizer/alloca.ll | 29 +
.../Instrumentation/TypeSanitizer/anon.ll | 283 ++++++
.../TypeSanitizer/basic-nosan.ll | 93 ++
.../Instrumentation/TypeSanitizer/basic.ll | 214 +++++
.../Instrumentation/TypeSanitizer/byval.ll | 88 ++
.../Instrumentation/TypeSanitizer/globals.ll | 66 ++
.../TypeSanitizer/invalid-metadata.ll | 25 +
.../TypeSanitizer/memintrinsics.ll | 77 ++
.../TypeSanitizer/nosanitize.ll | 39 +
.../TypeSanitizer/sanitize-no-tbaa.ll | 180 ++++
.../TypeSanitizer/swifterror.ll | 24 +
24 files changed, 2137 insertions(+), 6 deletions(-)
create mode 100644 llvm/include/llvm/Transforms/Instrumentation/TypeSanitizer.h
create mode 100644 llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp
create mode 100644 llvm/test/Instrumentation/TypeSanitizer/access-with-offfset.ll
create mode 100644 llvm/test/Instrumentation/TypeSanitizer/alloca.ll
create mode 100644 llvm/test/Instrumentation/TypeSanitizer/anon.ll
create mode 100644 llvm/test/Instrumentation/TypeSanitizer/basic-nosan.ll
create mode 100644 llvm/test/Instrumentation/TypeSanitizer/basic.ll
create mode 100644 llvm/test/Instrumentation/TypeSanitizer/byval.ll
create mode 100644 llvm/test/Instrumentation/TypeSanitizer/globals.ll
create mode 100644 llvm/test/Instrumentation/TypeSanitizer/invalid-metadata.ll
create mode 100644 llvm/test/Instrumentation/TypeSanitizer/memintrinsics.ll
create mode 100644 llvm/test/Instrumentation/TypeSanitizer/nosanitize.ll
create mode 100644 llvm/test/Instrumentation/TypeSanitizer/sanitize-no-tbaa.ll
create mode 100644 llvm/test/Instrumentation/TypeSanitizer/swifterror.ll
diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
index 41909a8fc1d590..21fd27d9838db7 100644
--- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h
+++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
@@ -787,6 +787,7 @@ enum AttributeKindCodes {
ATTR_KIND_CORO_ELIDE_SAFE = 98,
ATTR_KIND_NO_EXT = 99,
ATTR_KIND_NO_DIVERGENCE_SOURCE = 100,
+ ATTR_KIND_SANITIZE_TYPE = 101,
};
enum ComdatSelectionKindCodes {
diff --git a/llvm/include/llvm/IR/Attributes.td b/llvm/include/llvm/IR/Attributes.td
index 49f4527bde66e7..179238bc733830 100644
--- a/llvm/include/llvm/IR/Attributes.td
+++ b/llvm/include/llvm/IR/Attributes.td
@@ -317,6 +317,9 @@ def SanitizeAddress : EnumAttr<"sanitize_address", IntersectPreserve, [FnAttr]>;
/// ThreadSanitizer is on.
def SanitizeThread : EnumAttr<"sanitize_thread", IntersectPreserve, [FnAttr]>;
+/// TypeSanitizer is on.
+def SanitizeType : EnumAttr<"sanitize_type", [FnAttr]>;
+
/// MemorySanitizer is on.
def SanitizeMemory : EnumAttr<"sanitize_memory", IntersectPreserve, [FnAttr]>;
@@ -425,6 +428,7 @@ class CompatRuleStrAttr<string F, string Attr> : CompatRule<F> {
def : CompatRule<"isEqual<SanitizeAddressAttr>">;
def : CompatRule<"isEqual<SanitizeThreadAttr>">;
+def : CompatRule<"isEqual<SanitizeTypeAttr>">;
def : CompatRule<"isEqual<SanitizeMemoryAttr>">;
def : CompatRule<"isEqual<SanitizeHWAddressAttr>">;
def : CompatRule<"isEqual<SanitizeMemTagAttr>">;
diff --git a/llvm/include/llvm/Transforms/Instrumentation/TypeSanitizer.h b/llvm/include/llvm/Transforms/Instrumentation/TypeSanitizer.h
new file mode 100644
index 00000000000000..a6cc56df35f14d
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Instrumentation/TypeSanitizer.h
@@ -0,0 +1,38 @@
+//===- Transforms/Instrumentation/TypeSanitizer.h - TySan Pass -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the type sanitizer pass.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_INSTRUMENTATION_TYPESANITIZER_H
+#define LLVM_TRANSFORMS_INSTRUMENTATION_TYPESANITIZER_H
+
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+class Function;
+class FunctionPass;
+class Module;
+
+/// A function pass for tysan instrumentation.
+struct TypeSanitizerPass : public PassInfoMixin<TypeSanitizerPass> {
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
+ static bool isRequired() { return true; }
+};
+
+/// A module pass for tysan instrumentation.
+///
+/// Create ctor and init functions.
+struct ModuleTypeSanitizerPass : public PassInfoMixin<ModuleTypeSanitizerPass> {
+ PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+ static bool isRequired() { return true; }
+};
+
+} // namespace llvm
+#endif /* LLVM_TRANSFORMS_INSTRUMENTATION_TYPESANITIZER_H */
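As a rough reference for the input these passes operate on: type checks
are emitted for functions carrying the new sanitize_type attribute (added
in the Attributes.td and bitcode changes in this patch), and the shadow
updates are keyed off the TBAA metadata of each access. A minimal,
hypothetical example of such IR (not taken from the tests in this patch):

  define i32 @read_int(ptr %p) sanitize_type {
    %v = load i32, ptr %p, !tbaa !0
    ret i32 %v
  }

  !0 = !{!1, !1, i64 0}
  !1 = !{!"int", !2, i64 0}
  !2 = !{!"omnipotent char", !3, i64 0}
  !3 = !{!"Simple C/C++ TBAA"}

With the registrations added to PassRegistry.def below, such a file can be
exercised via, e.g., opt -passes='tysan-module,function(tysan)'.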
diff --git a/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp b/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp
index fd11c3abc379e2..a499e16ff0097f 100644
--- a/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp
+++ b/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp
@@ -372,11 +372,27 @@ static bool isStructPathTBAA(const MDNode *MD) {
return isa<MDNode>(MD->getOperand(0)) && MD->getNumOperands() >= 3;
}
+// When using the TypeSanitizer, don't use TBAA information for alias analysis.
+// This might cause us to remove memory accesses that we need to verify at
+// runtime.
+static bool usingSanitizeType(const Value *V) {
+ const Function *F;
+
+ if (auto *I = dyn_cast<Instruction>(V))
+ F = I->getParent()->getParent();
+ else if (auto *A = dyn_cast<Argument>(V))
+ F = A->getParent();
+ else
+ return false;
+
+ return F->hasFnAttribute(Attribute::SanitizeType);
+}
+
AliasResult TypeBasedAAResult::alias(const MemoryLocation &LocA,
const MemoryLocation &LocB,
AAQueryInfo &AAQI, const Instruction *) {
- if (!EnableTBAA)
- return AliasResult::MayAlias;
+ if (!EnableTBAA || usingSanitizeType(LocA.Ptr) || usingSanitizeType(LocB.Ptr))
+ return AAResultBase::alias(LocA, LocB, AAQI, nullptr);
if (Aliases(LocA.AATags.TBAA, LocB.AATags.TBAA))
return AliasResult::MayAlias;
@@ -426,8 +442,8 @@ MemoryEffects TypeBasedAAResult::getMemoryEffects(const Function *F) {
ModRefInfo TypeBasedAAResult::getModRefInfo(const CallBase *Call,
const MemoryLocation &Loc,
AAQueryInfo &AAQI) {
- if (!EnableTBAA)
- return ModRefInfo::ModRef;
+ if (!EnableTBAA || usingSanitizeType(Call))
+ return AAResultBase::getModRefInfo(Call, Loc, AAQI);
if (const MDNode *L = Loc.AATags.TBAA)
if (const MDNode *M = Call->getMetadata(LLVMContext::MD_tbaa))
@@ -440,8 +456,8 @@ ModRefInfo TypeBasedAAResult::getModRefInfo(const CallBase *Call,
ModRefInfo TypeBasedAAResult::getModRefInfo(const CallBase *Call1,
const CallBase *Call2,
AAQueryInfo &AAQI) {
- if (!EnableTBAA)
- return ModRefInfo::ModRef;
+ if (!EnableTBAA || usingSanitizeType(Call1))
+ return AAResultBase::getModRefInfo(Call1, Call2, AAQI);
if (const MDNode *M1 = Call1->getMetadata(LLVMContext::MD_tbaa))
if (const MDNode *M2 = Call2->getMetadata(LLVMContext::MD_tbaa))
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index 85c6fadeda6cc3..a01ecf0d56642e 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -2192,6 +2192,8 @@ static Attribute::AttrKind getAttrFromCode(uint64_t Code) {
return Attribute::SanitizeHWAddress;
case bitc::ATTR_KIND_SANITIZE_THREAD:
return Attribute::SanitizeThread;
+ case bitc::ATTR_KIND_SANITIZE_TYPE:
+ return Attribute::SanitizeType;
case bitc::ATTR_KIND_SANITIZE_MEMORY:
return Attribute::SanitizeMemory;
case bitc::ATTR_KIND_SANITIZE_NUMERICAL_STABILITY:
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index 0444cb9e1bce5d..b4efd3928a2e6f 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -851,6 +851,8 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) {
return bitc::ATTR_KIND_SANITIZE_HWADDRESS;
case Attribute::SanitizeThread:
return bitc::ATTR_KIND_SANITIZE_THREAD;
+ case Attribute::SanitizeType:
+ return bitc::ATTR_KIND_SANITIZE_TYPE;
case Attribute::SanitizeMemory:
return bitc::ATTR_KIND_SANITIZE_MEMORY;
case Attribute::SanitizeNumericalStability:
diff --git a/llvm/lib/CodeGen/ShrinkWrap.cpp b/llvm/lib/CodeGen/ShrinkWrap.cpp
index 2742437ceb5895..5029f45def2266 100644
--- a/llvm/lib/CodeGen/ShrinkWrap.cpp
+++ b/llvm/lib/CodeGen/ShrinkWrap.cpp
@@ -986,6 +986,7 @@ bool ShrinkWrap::isShrinkWrapEnabled(const MachineFunction &MF) {
!(MF.getFunction().hasFnAttribute(Attribute::SanitizeAddress) ||
MF.getFunction().hasFnAttribute(Attribute::SanitizeThread) ||
MF.getFunction().hasFnAttribute(Attribute::SanitizeMemory) ||
+ MF.getFunction().hasFnAttribute(Attribute::SanitizeType) ||
MF.getFunction().hasFnAttribute(Attribute::SanitizeHWAddress));
// If EnableShrinkWrap is set, it takes precedence on whatever the
// target sets. The rational is that we assume we want to test
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 260a34f2e060d6..bf9fd30d905b99 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -225,6 +225,7 @@
#include "llvm/Transforms/Instrumentation/SanitizerBinaryMetadata.h"
#include "llvm/Transforms/Instrumentation/SanitizerCoverage.h"
#include "llvm/Transforms/Instrumentation/ThreadSanitizer.h"
+#include "llvm/Transforms/Instrumentation/TypeSanitizer.h"
#include "llvm/Transforms/ObjCARC.h"
#include "llvm/Transforms/Scalar/ADCE.h"
#include "llvm/Transforms/Scalar/AlignmentFromAssumptions.h"
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 772ec5fd10e633..20547895bb2956 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -154,6 +154,7 @@ MODULE_PASS("strip-nonlinetable-debuginfo", StripNonLineTableDebugInfoPass())
MODULE_PASS("trigger-crash-module", TriggerCrashModulePass())
MODULE_PASS("trigger-verifier-error", TriggerVerifierErrorPass())
MODULE_PASS("tsan-module", ModuleThreadSanitizerPass())
+MODULE_PASS("tysan-module", ModuleTypeSanitizerPass())
MODULE_PASS("verify", VerifierPass())
MODULE_PASS("view-callgraph", CallGraphViewerPass())
MODULE_PASS("wholeprogramdevirt", WholeProgramDevirtPass())
@@ -477,6 +478,7 @@ FUNCTION_PASS("transform-warning", WarnMissedTransformationsPass())
FUNCTION_PASS("trigger-crash-function", TriggerCrashFunctionPass())
FUNCTION_PASS("trigger-verifier-error", TriggerVerifierErrorPass())
FUNCTION_PASS("tsan", ThreadSanitizerPass())
+FUNCTION_PASS("tysan", TypeSanitizerPass())
FUNCTION_PASS("typepromotion", TypePromotionPass(TM))
FUNCTION_PASS("unify-loop-exits", UnifyLoopExitsPass())
FUNCTION_PASS("vector-combine", VectorCombinePass())
diff --git a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt
index 3e3c3eced4bb9c..5abc7fc8052834 100644
--- a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt
+++ b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt
@@ -24,6 +24,7 @@ add_llvm_component_library(LLVMInstrumentation
SanitizerBinaryMetadata.cpp
ValueProfileCollector.cpp
ThreadSanitizer.cpp
+ TypeSanitizer.cpp
HWAddressSanitizer.cpp
RealtimeSanitizer.cpp
diff --git a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp
new file mode 100644
index 00000000000000..ed4aba4ad612d9
--- /dev/null
+++ b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp
@@ -0,0 +1,873 @@
+//===----- TypeSanitizer.cpp - type-based-aliasing-violation detector -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of TypeSanitizer, a type-based-aliasing-violation
+// detector.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Instrumentation/TypeSanitizer.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MD5.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/Regex.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+#include <cctype>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "tysan"
+
+static const char *const kTysanModuleCtorName = "tysan.module_ctor";
+static const char *const kTysanInitName = "__tysan_init";
+static const char *const kTysanCheckName = "__tysan_check";
+static const char *const kTysanGVNamePrefix = "__tysan_v1_";
+
+static const char *const kTysanShadowMemoryAddress =
+ "__tysan_shadow_memory_address";
+static const char *const kTysanAppMemMask = "__tysan_app_memory_mask";
+
+static cl::opt<bool>
+ ClWritesAlwaysSetType("tysan-writes-always-set-type",
+ cl::desc("Writes always set the type"), cl::Hidden,
+ cl::init(false));
+
+STATISTIC(NumInstrumentedAccesses, "Number of instrumented accesses");
+
+static Regex AnonNameRegex("^_ZTS.*N[1-9][0-9]*_GLOBAL__N");
+
+namespace {
+
+/// TypeSanitizer: instrument the code in module to find type-based aliasing
+/// violations.
+struct TypeSanitizer {
+ TypeSanitizer(Module &M);
+ bool run(Function &F, const TargetLibraryInfo &TLI);
+ void instrumentGlobals();
+
+private:
+ typedef SmallDenseMap<const MDNode *, GlobalVariable *, 8>
+ TypeDescriptorsMapTy;
+ typedef SmallDenseMap<const MDNode *, std::string, 8> TypeNameMapTy;
+
+ void initializeCallbacks(Module &M);
+
+ Value *getShadowBase(Function &F);
+ Value *getAppMemMask(Function &F);
+
+ bool instrumentWithShadowUpdate(IRBuilder<> &IRB, const MDNode *TBAAMD,
+ Value *Ptr, uint64_t AccessSize, bool IsRead,
+ bool IsWrite, Value *&ShadowBase,
+ Value *&AppMemMask, bool ForceSetType,
+ bool SanitizeFunction,
+ TypeDescriptorsMapTy &TypeDescriptors,
+ const DataLayout &DL);
+ bool instrumentMemoryAccess(Instruction *I, MemoryLocation &MLoc,
+ Value *&ShadowBase, Value *&AppMemMask,
+ bool SanitizeFunction,
+ TypeDescriptorsMapTy &TypeDescriptors,
+ const DataLayout &DL);
+ bool instrumentMemInst(Value *I, Value *&ShadowBase, Value *&AppMemMask,
+ const DataLayout &DL);
+
+ std::string getAnonymousStructIdentifier(const MDNode *MD,
+ TypeNameMapTy &TypeNames);
+ bool generateTypeDescriptor(const MDNode *MD,
+ TypeDescriptorsMapTy &TypeDescriptors,
+ TypeNameMapTy &TypeNames, Module &M);
+ bool generateBaseTypeDescriptor(const MDNode *MD,
+ TypeDescriptorsMapTy &TypeDescriptors,
+ TypeNameMapTy &TypeNames, Module &M);
+
+ const Triple TargetTriple;
+ Type *IntptrTy;
+ uint64_t PtrShift;
+ IntegerType *OrdTy;
+
+ // Callbacks to run-time library are computed in doInitialization.
+ Function *TysanCheck;
+ Function *TysanCtorFunction;
+ Function *TysanGlobalsSetTypeFunction;
+};
+} // namespace
+
+TypeSanitizer::TypeSanitizer(Module &M)
+ : TargetTriple(Triple(M.getTargetTriple())) {
+ const DataLayout &DL = M.getDataLayout();
+ IntptrTy = DL.getIntPtrType(M.getContext());
+ PtrShift = countr_zero(IntptrTy->getPrimitiveSizeInBits() / 8);
+
+ TysanGlobalsSetTypeFunction = M.getFunction("__tysan_set_globals_types");
+ initializeCallbacks(M);
+}
+
+void TypeSanitizer::initializeCallbacks(Module &M) {
+ IRBuilder<> IRB(M.getContext());
+ OrdTy = IRB.getInt32Ty();
+
+ AttributeList Attr;
+ Attr = Attr.addFnAttribute(M.getContext(), Attribute::NoUnwind);
+ // Initialize the callbacks.
+ TysanCheck = cast<Function>(
+ M.getOrInsertFunction(kTysanCheckName, Attr, IRB.getVoidTy(),
+ IRB.getPtrTy(), // Pointer to data to be read.
+ OrdTy, // Size of the data in bytes.
+ IRB.getPtrTy(), // Pointer to type descriptor.
+ OrdTy // Flags.
+ )
+ .getCallee());
+
+ TysanCtorFunction = cast<Function>(
+ M.getOrInsertFunction(kTysanModuleCtorName, Attr, IRB.getVoidTy())
+ .getCallee());
+}
+
+void TypeSanitizer::instrumentGlobals() {
+ Module &M = *TysanCtorFunction->getParent();
+ initializeCallbacks(M);
+ TysanGlobalsSetTypeFunction = nullptr;
+
+ NamedMDNode *Globals = M.getNamedMetadata("llvm.tysan.globals");
+ if (!Globals)
+ return;
+
+ const DataLayout &DL = M.getDataLayout();
+ Value *ShadowBase = nullptr, *AppMemMask = nullptr;
+ TypeDescriptorsMapTy TypeDescriptors;
+ TypeNameMapTy TypeNames;
+
+ for (const auto &GMD : Globals->operands()) {
+ auto *GV = mdconst::dyn_extract_or_null<GlobalVariable>(GMD->getOperand(0));
+ if (!GV)
+ continue;
+ const MDNode *TBAAMD = cast<MDNode>(GMD->getOperand(1));
+ if (!generateBaseTypeDescriptor(TBAAMD, TypeDescriptors, TypeNames, M))
+ continue;
+
+ if (!TysanGlobalsSetTypeFunction) {
+ TysanGlobalsSetTypeFunction = Function::Create(
+ FunctionType::get(Type::getVoidTy(M.getContext()), false),
+ GlobalValue::InternalLinkage, "__tysan_set_globals_types", &M);
+ BasicBlock *BB =
+ BasicBlock::Create(M.getContext(), "", TysanGlobalsSetTypeFunction);
+ ReturnInst::Create(M.getContext(), BB);
+ }
+
+ IRBuilder<> IRB(
+ TysanGlobalsSetTypeFunction->getEntryBlock().getTerminator());
+ Type *AccessTy = GV->getValueType();
+ assert(AccessTy->isSized());
+ uint64_t AccessSize = DL.getTypeStoreSize(AccessTy);
+ instrumentWithShadowUpdate(IRB, TBAAMD, GV, AccessSize, false, false,
+ ShadowBase, AppMemMask, true, false,
+ TypeDescriptors, DL);
+ }
+
+ if (TysanGlobalsSetTypeFunction) {
+ IRBuilder<> IRB(TysanCtorFunction->getEntryBlock().getTerminator());
+ IRB.CreateCall(TysanGlobalsSetTypeFunction, {});
+ }
+}
+
+static void insertModuleCtor(Module &M) {
+ Function *TysanCtorFunction;
+ std::tie(TysanCtorFunction, std::ignore) =
+ createSanitizerCtorAndInitFunctions(M, kTysanModuleCtorName,
+ kTysanInitName, /*InitArgTypes=*/{},
+ /*InitArgs=*/{});
+
+ TypeSanitizer TySan(M);
+ TySan.instrumentGlobals();
+ appendToGlobalCtors(M, TysanCtorFunction, 0);
+}
+
+static const char LUT[] = "0123456789abcdef";
+
+static std::string encodeName(StringRef Name) {
+ size_t Length = Name.size();
+ std::string Output = kTysanGVNamePrefix;
+ Output.reserve(Output.size() + 3 * Length);
+ for (size_t i = 0; i < Length; ++i) {
+ const unsigned char c = Name[i];
+ if (isalnum((int)c)) {
+ Output.push_back(c);
+ continue;
+ }
+
+ if (c == '_') {
+ Output.append("__");
+ continue;
+ }
+
+ Output.push_back('_');
+ Output.push_back(LUT[c >> 4]);
+ Output.push_back(LUT[c & 15]);
+ }
+
+ return Output;
+}
+
+static bool isAnonymousNamespaceName(StringRef Name) {
+ // Types that are in an anonymous namespace are local to this module.
+ // FIXME: This should really be marked by the frontend in the metadata
+ // instead of having us guess this from the mangled name. Moreover, the regex
+ // here can pick up (unlikely) names in the non-reserved namespace (because
+ // it needs to search into the type to pick up cases where the type in the
+ // anonymous namespace is a template parameter, etc.).
+ return AnonNameRegex.match(Name);
+}
+
+std::string
+TypeSanitizer::getAnonymousStructIdentifier(const MDNode *MD,
+ TypeNameMapTy &TypeNames) {
+ MD5 Hash;
+
+ for (int i = 1, e = MD->getNumOperands(); i < e; i += 2) {
+ const MDNode *MemberNode = dyn_cast<MDNode>(MD->getOperand(i));
+ if (!MemberNode)
+ return "";
+
+ auto TNI = TypeNames.find(MemberNode);
+ std::string MemberName;
+ if (TNI != TypeNames.end()) {
+ MemberName = TNI->second;
+ } else {
+ if (MemberNode->getNumOperands() < 1)
+ return "";
+ MDString *MemberNameNode = dyn_cast<MDString>(MemberNode->getOperand(0));
+ if (!MemberNameNode)
+ return "";
+ MemberName = MemberNameNode->getString().str();
+ if (MemberName.empty())
+ MemberName = getAnonymousStructIdentifier(MemberNode, TypeNames);
+ if (MemberName.empty())
+ return "";
+ TypeNames[MemberNode] = MemberName;
+ }
+
+ Hash.update(MemberName);
+ Hash.update("\0");
+
+ uint64_t Offset =
+ mdconst::extract<ConstantInt>(MD->getOperand(i + 1))->getZExtValue();
+ Hash.update(utostr(Offset));
+ Hash.update("\0");
+ }
+
+ MD5::MD5Result HashResult;
+ Hash.final(HashResult);
+ return "__anonymous_" + std::string(HashResult.digest().str());
+}
+
+bool TypeSanitizer::generateBaseTypeDescriptor(
+ const MDNode *MD, TypeDescriptorsMapTy &TypeDescriptors,
+ TypeNameMapTy &TypeNames, Module &M) {
+ if (MD->getNumOperands() < 1)
+ return false;
+
+ MDString *NameNode = dyn_cast<MDString>(MD->getOperand(0));
+ if (!NameNode)
+ return false;
+
+ std::string Name = NameNode->getString().str();
+ if (Name.empty())
+ Name = getAnonymousStructIdentifier(MD, TypeNames);
+ if (Name.empty())
+ return false;
+ TypeNames[MD] = Name;
+ std::string EncodedName = encodeName(Name);
+
+ GlobalVariable *GV =
+ dyn_cast_or_null<GlobalVariable>(M.getNamedValue(EncodedName));
+ if (GV) {
+ TypeDescriptors[MD] = GV;
+ return true;
+ }
+
+ SmallVector<std::pair<Constant *, uint64_t>> Members;
+ for (int i = 1, e = MD->getNumOperands(); i < e; i += 2) {
+ const MDNode *MemberNode = dyn_cast<MDNode>(MD->getOperand(i));
+ if (!MemberNode)
+ return false;
+
+ Constant *Member;
+ auto TDI = TypeDescriptors.find(MemberNode);
+ if (TDI != TypeDescriptors.end()) {
+ Member = TDI->second;
+ } else {
+ if (!generateBaseTypeDescriptor(MemberNode, TypeDescriptors, TypeNames,
+ M))
+ return false;
+
+ Member = TypeDescriptors[MemberNode];
+ }
+
+ uint64_t Offset =
+ mdconst::extract<ConstantInt>(MD->getOperand(i + 1))->getZExtValue();
+
+ Members.push_back(std::make_pair(Member, Offset));
+ }
+
+ // The descriptor for a scalar is:
+ // [2, member count, [type pointer, offset]..., name]
+
+ LLVMContext &C = MD->getContext();
+ Constant *NameData = ConstantDataArray::getString(C, NameNode->getString());
+ SmallVector<Type *> TDSubTys;
+ SmallVector<Constant *> TDSubData;
+
+ TDSubTys.push_back(IntptrTy);
+ TDSubData.push_back(ConstantInt::get(IntptrTy, 2));
+
+ TDSubTys.push_back(IntptrTy);
+ TDSubData.push_back(ConstantInt::get(IntptrTy, Members.size()));
+
+ bool ShouldBeComdat = !isAnonymousNamespaceName(NameNode->getString());
+ for (auto &Member : Members) {
+ TDSubTys.push_back(Member.first->getType());
+ TDSubData.push_back(Member.first);
+
+ TDSubTys.push_back(IntptrTy);
+ TDSubData.push_back(ConstantInt::get(IntptrTy, Member.second));
+ }
+
+ TDSubTys.push_back(NameData->getType());
+ TDSubData.push_back(NameData);
+
+ StructType *TDTy = StructType::get(C, TDSubTys);
+ Constant *TD = ConstantStruct::get(TDTy, TDSubData);
+
+ GlobalVariable *TDGV =
+ new GlobalVariable(TDTy, true,
+ !ShouldBeComdat ? GlobalValue::InternalLinkage
+ : GlobalValue::LinkOnceODRLinkage,
+ TD, EncodedName);
+ M.insertGlobalVariable(TDGV);
+
+ if (ShouldBeComdat) {
+ if (TargetTriple.isOSBinFormatELF()) {
+ Comdat *TDComdat = M.getOrInsertComdat(EncodedName);
+ TDGV->setComdat(TDComdat);
+ }
+ appendToUsed(M, TDGV);
+ }
+
+ TypeDescriptors[MD] = TDGV;
+ return true;
+}
+
+bool TypeSanitizer::generateTypeDescriptor(
+ const MDNode *MD, TypeDescriptorsMapTy &TypeDescriptors,
+ TypeNameMapTy &TypeNames, Module &M) {
+ // Here we need to generate a type descriptor corresponding to this TBAA
+ // metadata node. Under the current scheme there are three kinds of TBAA
+ // metadata nodes: scalar nodes, struct nodes, and struct tag nodes.
+
+ if (MD->getNumOperands() < 3)
+ return false;
+
+ const MDNode *BaseNode = dyn_cast<MDNode>(MD->getOperand(0));
+ if (!BaseNode)
+ return false;
+
+ // This is a struct tag (element-access) node.
+
+ const MDNode *AccessNode = dyn_cast<MDNode>(MD->getOperand(1));
+ if (!AccessNode)
+ return false;
+
+ Constant *Base;
+ auto TDI = TypeDescriptors.find(BaseNode);
+ if (TDI != TypeDescriptors.end()) {
+ Base = TDI->second;
+ } else {
+ if (!generateBaseTypeDescriptor(BaseNode, TypeDescriptors, TypeNames, M))
+ return false;
+
+ Base = TypeDescriptors[BaseNode];
+ }
+
+ Constant *Access;
+ TDI = TypeDescriptors.find(AccessNode);
+ if (TDI != TypeDescriptors.end()) {
+ Access = TDI->second;
+ } else {
+ if (!generateBaseTypeDescriptor(AccessNode, TypeDescriptors, TypeNames, M))
+ return false;
+
+ Access = TypeDescriptors[AccessNode];
+ }
+
+ uint64_t Offset =
+ mdconst::extract<ConstantInt>(MD->getOperand(2))->getZExtValue();
+ std::string EncodedName =
+ std::string(Base->getName()) + "_o_" + utostr(Offset);
+
+ GlobalVariable *GV =
+ dyn_cast_or_null<GlobalVariable>(M.getNamedValue(EncodedName));
+ if (GV) {
+ TypeDescriptors[MD] = GV;
+ return true;
+ }
+
+ // The descriptor for a scalar is:
+ // [1, base-type pointer, access-type pointer, offset]
+
+ StructType *TDTy =
+ StructType::get(IntptrTy, Base->getType(), Access->getType(), IntptrTy);
+ Constant *TD =
+ ConstantStruct::get(TDTy, ConstantInt::get(IntptrTy, 1), Base, Access,
+ ConstantInt::get(IntptrTy, Offset));
+
+ bool ShouldBeComdat = cast<GlobalVariable>(Base)->getLinkage() ==
+ GlobalValue::LinkOnceODRLinkage;
+
+ GlobalVariable *TDGV =
+ new GlobalVariable(TDTy, true,
+ !ShouldBeComdat ? GlobalValue::InternalLinkage
+ : GlobalValue::LinkOnceODRLinkage,
+ TD, EncodedName);
+ M.insertGlobalVariable(TDGV);
+
+ if (ShouldBeComdat) {
+ if (TargetTriple.isOSBinFormatELF()) {
+ Comdat *TDComdat = M.getOrInsertComdat(EncodedName);
+ TDGV->setComdat(TDComdat);
+ }
+ appendToUsed(M, TDGV);
+ }
+
+ TypeDescriptors[MD] = TDGV;
+ return true;
+}
+
+Value *TypeSanitizer::getShadowBase(Function &F) {
+ IRBuilder<> IRB(&F.front().front());
+ Constant *GlobalShadowAddress =
+ F.getParent()->getOrInsertGlobal(kTysanShadowMemoryAddress, IntptrTy);
+ return IRB.CreateLoad(IntptrTy, GlobalShadowAddress, "shadow.base");
+}
+
+Value *TypeSanitizer::getAppMemMask(Function &F) {
+ IRBuilder<> IRB(&F.front().front());
+ Value *GlobalAppMemMask =
+ F.getParent()->getOrInsertGlobal(kTysanAppMemMask, IntptrTy);
+ return IRB.CreateLoad(IntptrTy, GlobalAppMemMask, "app.mem.mask");
+}
+
+bool TypeSanitizer::run(Function &F, const TargetLibraryInfo &TLI) {
+ // This is required to prevent instrumenting the call to __tysan_init from within
+ // the module constructor.
+ if (&F == TysanCtorFunction || &F == TysanGlobalsSetTypeFunction)
+ return false;
+ initializeCallbacks(*F.getParent());
+
+ SmallVector<std::pair<Instruction *, MemoryLocation>> MemoryAccesses;
+ SmallSetVector<const MDNode *, 8> TBAAMetadata;
+ SmallVector<Value *> MemTypeResetInsts;
+
+ bool Res = false;
+ bool SanitizeFunction = F.hasFnAttribute(Attribute::SanitizeType);
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ // Traverse all instructions, collect loads/stores/returns, check for calls.
+ for (auto &BB : F) {
+ for (auto &Inst : BB) {
+ // Skip memory accesses inserted by another instrumentation.
+ if (Inst.getMetadata(LLVMContext::MD_nosanitize))
+ continue;
+
+ if (isa<LoadInst>(Inst) || isa<StoreInst>(Inst) ||
+ isa<AtomicCmpXchgInst>(Inst) || isa<AtomicRMWInst>(Inst)) {
+ MemoryLocation MLoc = MemoryLocation::get(&Inst);
+
+ // Swift errors are special (we can't introduce extra uses on them).
+ if (MLoc.Ptr->isSwiftError())
+ continue;
+
+ // Skip non-address-space-0 pointers; we don't know how to handle them.
+ Type *PtrTy = cast<PointerType>(MLoc.Ptr->getType());
+ if (PtrTy->getPointerAddressSpace() != 0)
+ continue;
+
+ if (MLoc.AATags.TBAA)
+ TBAAMetadata.insert(MLoc.AATags.TBAA);
+ MemoryAccesses.push_back(std::make_pair(&Inst, MLoc));
+ } else if (isa<CallInst>(Inst) || isa<InvokeInst>(Inst)) {
+ if (CallInst *CI = dyn_cast<CallInst>(&Inst))
+ maybeMarkSanitizerLibraryCallNoBuiltin(CI, &TLI);
+
+ if (isa<MemIntrinsic>(Inst)) {
+ MemTypeResetInsts.push_back(&Inst);
+ } else if (auto *II = dyn_cast<IntrinsicInst>(&Inst)) {
+ if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
+ II->getIntrinsicID() == Intrinsic::lifetime_end)
+ MemTypeResetInsts.push_back(&Inst);
+ }
+ } else if (isa<AllocaInst>(Inst)) {
+ MemTypeResetInsts.push_back(&Inst);
+ }
+ }
+ }
+
+ // byval arguments also need their types reset (they're new stack memory,
+ // just like allocas).
+ for (auto &A : F.args())
+ if (A.hasByValAttr())
+ MemTypeResetInsts.push_back(&A);
+
+ // We have collected all loads and stores, and know for what TBAA nodes we
+ // need to generate type descriptors.
+
+ Module &M = *F.getParent();
+ TypeDescriptorsMapTy TypeDescriptors;
+ TypeNameMapTy TypeNames;
+ for (const MDNode *MD : TBAAMetadata) {
+ if (TypeDescriptors.count(MD))
+ continue;
+
+ if (!generateTypeDescriptor(MD, TypeDescriptors, TypeNames, M))
+ return Res; // Giving up.
+
+ Res = true;
+ }
+
+ Value *ShadowBase = nullptr, *AppMemMask = nullptr;
+ for (auto &MA : MemoryAccesses)
+ Res |= instrumentMemoryAccess(MA.first, MA.second, ShadowBase, AppMemMask,
+ SanitizeFunction, TypeDescriptors, DL);
+
+ for (auto Inst : MemTypeResetInsts)
+ Res |= instrumentMemInst(Inst, ShadowBase, AppMemMask, DL);
+
+ return Res;
+}
+
+static Value *ConvertToShadowDataInt(IRBuilder<> &IRB, Value *Ptr,
+ Type *IntptrTy, uint64_t PtrShift,
+ Value *ShadowBase, Value *AppMemMask) {
+ return IRB.CreateAdd(
+ IRB.CreateShl(
+ IRB.CreateAnd(IRB.CreatePtrToInt(Ptr, IntptrTy, "app.ptr.int"),
+ AppMemMask, "app.ptr.masked"),
+ PtrShift, "app.ptr.shifted"),
+ ShadowBase, "shadow.ptr.int");
+}
+
+bool TypeSanitizer::instrumentWithShadowUpdate(
+ IRBuilder<> &IRB, const MDNode *TBAAMD, Value *Ptr, uint64_t AccessSize,
+ bool IsRead, bool IsWrite, Value *&ShadowBase, Value *&AppMemMask,
+ bool ForceSetType, bool SanitizeFunction,
+ TypeDescriptorsMapTy &TypeDescriptors, const DataLayout &DL) {
+ if (!ShadowBase)
+ ShadowBase = getShadowBase(*IRB.GetInsertBlock()->getParent());
+ if (!AppMemMask)
+ AppMemMask = getAppMemMask(*IRB.GetInsertBlock()->getParent());
+
+ Constant *TDGV;
+ if (TBAAMD)
+ TDGV = TypeDescriptors[TBAAMD];
+ else
+ TDGV = Constant::getNullValue(IRB.getPtrTy());
+
+ Value *TD = IRB.CreateBitCast(TDGV, IRB.getPtrTy());
+
+ Value *ShadowDataInt = ConvertToShadowDataInt(IRB, Ptr, IntptrTy, PtrShift,
+ ShadowBase, AppMemMask);
+ Type *Int8PtrPtrTy = IRB.getPtrTy()->getPointerTo();
+ Value *ShadowData =
+ IRB.CreateIntToPtr(ShadowDataInt, Int8PtrPtrTy, "shadow.ptr");
+
+ auto SetType = [&]() {
+ IRB.CreateStore(TD, ShadowData);
+
+ // Now fill the remainder of the shadow memory corresponding to the
+ // remainder of the bytes of the type with a bad type descriptor.
+ for (uint64_t i = 1; i < AccessSize; ++i) {
+ Value *BadShadowData = IRB.CreateIntToPtr(
+ IRB.CreateAdd(ShadowDataInt,
+ ConstantInt::get(IntptrTy, i << PtrShift),
+ "shadow.byte." + Twine(i) + ".offset"),
+ Int8PtrPtrTy, "shadow.byte." + Twine(i) + ".ptr");
+
+ // This is the TD value, -i, which is used to indicate that the byte is
+ // i bytes after the first byte of the type.
+ Value *BadTD =
+ IRB.CreateIntToPtr(ConstantInt::getSigned(IntptrTy, -i),
+ IRB.getPtrTy(), "bad.descriptor" + Twine(i));
+ IRB.CreateStore(BadTD, BadShadowData);
+ }
+ };
+
+ if (!ForceSetType && (!ClWritesAlwaysSetType || IsRead)) {
+ // We need to check the type here. If the type is unknown, then the read
+ // sets the type. If the type is known, then it is checked. If the type
+ // doesn't match, then we call the runtime (which may yet determine that
+ // the mismatch is okay).
+ LLVMContext &C = IRB.getContext();
+ MDNode *UnlikelyBW = MDBuilder(C).createBranchWeights(1, 100000);
+
+ Constant *Flags =
+ ConstantInt::get(OrdTy, (int)IsRead | (((int)IsWrite) << 1));
+
+ Value *LoadedTD =
+ IRB.CreateLoad(IRB.getPtrTy(), ShadowData, "shadow.desc");
+ if (SanitizeFunction) {
+ Value *BadTDCmp = IRB.CreateICmpNE(LoadedTD, TD, "bad.desc");
+ Instruction *BadTDTerm, *GoodTDTerm;
+ SplitBlockAndInsertIfThenElse(BadTDCmp, &*IRB.GetInsertPoint(),
+ &BadTDTerm, &GoodTDTerm, UnlikelyBW);
+ IRB.SetInsertPoint(BadTDTerm);
+
+ // We now know that the types did not match (we're on the slow path). If
+ // the type is unknown, then set it.
+ Value *NullTDCmp = IRB.CreateIsNull(LoadedTD);
+ Instruction *NullTDTerm, *MismatchTerm;
+ SplitBlockAndInsertIfThenElse(NullTDCmp, &*IRB.GetInsertPoint(),
+ &NullTDTerm, &MismatchTerm);
+
+ // If the type is unknown, then set the type.
+ IRB.SetInsertPoint(NullTDTerm);
+
+ // We're about to set the type. Make sure that all bytes in the value are
+ // also of unknown type.
+ Value *Size = ConstantInt::get(OrdTy, AccessSize);
+ Value *NotAllUnkTD = IRB.getFalse();
+ for (uint64_t i = 1; i < AccessSize; ++i) {
+ Value *UnkShadowData = IRB.CreateIntToPtr(
+ IRB.CreateAdd(ShadowDataInt,
+ ConstantInt::get(IntptrTy, i << PtrShift)),
+ Int8PtrPtrTy);
+ Value *ILdTD = IRB.CreateLoad(IRB.getPtrTy(), UnkShadowData);
+ NotAllUnkTD = IRB.CreateOr(NotAllUnkTD, IRB.CreateIsNotNull(ILdTD));
+ }
+
+ Instruction *BeforeSetType = &*IRB.GetInsertPoint();
+ Instruction *BadUTDTerm = SplitBlockAndInsertIfThen(
+ NotAllUnkTD, BeforeSetType, false, UnlikelyBW);
+ IRB.SetInsertPoint(BadUTDTerm);
+ IRB.CreateCall(TysanCheck, {IRB.CreateBitCast(Ptr, IRB.getPtrTy()),
+ Size, (Value *)TD, (Value *)Flags});
+
+ IRB.SetInsertPoint(BeforeSetType);
+ SetType();
+
+ // We have a non-trivial mismatch. Call the runtime.
+ IRB.SetInsertPoint(MismatchTerm);
+ IRB.CreateCall(TysanCheck, {IRB.CreateBitCast(Ptr, IRB.getPtrTy()),
+ Size, (Value *)TD, (Value *)Flags});
+
+ // We appear to have the right type. Make sure that all other bytes in
+ // the type are still marked as interior bytes. If not, call the runtime.
+ IRB.SetInsertPoint(GoodTDTerm);
+ Value *NotAllBadTD = IRB.getFalse();
+ for (uint64_t i = 1; i < AccessSize; ++i) {
+ Value *BadShadowData = IRB.CreateIntToPtr(
+ IRB.CreateAdd(ShadowDataInt,
+ ConstantInt::get(IntptrTy, i << PtrShift)),
+ Int8PtrPtrTy);
+ Value *ILdTD = IRB.CreatePtrToInt(
+ IRB.CreateLoad(IRB.getPtrTy(), BadShadowData), IntptrTy);
+ NotAllBadTD = IRB.CreateOr(
+ NotAllBadTD,
+ IRB.CreateICmpSGE(ILdTD, ConstantInt::get(IntptrTy, 0)));
+ }
+
+ Instruction *BadITDTerm = SplitBlockAndInsertIfThen(
+ NotAllBadTD, &*IRB.GetInsertPoint(), false, UnlikelyBW);
+ IRB.SetInsertPoint(BadITDTerm);
+ IRB.CreateCall(TysanCheck, {IRB.CreateBitCast(Ptr, IRB.getPtrTy()),
+ Size, (Value *)TD, (Value *)Flags});
+ } else {
+ // If we're not sanitizing this function, then we only care whether we
+ // need to *set* the type.
+ Value *NullTDCmp = IRB.CreateIsNull(LoadedTD, "desc.set");
+ Instruction *NullTDTerm = SplitBlockAndInsertIfThen(
+ NullTDCmp, &*IRB.GetInsertPoint(), false, UnlikelyBW);
+ IRB.SetInsertPoint(NullTDTerm);
+ NullTDTerm->getParent()->setName("set.type");
+ SetType();
+ }
+  } else if (ForceSetType || IsWrite) {
+    // If we're forcing the type to be set, or we're in the mode where writes
+    // always set the type and this is a write (which does not also read),
+    // just set the type.
+ SetType();
+ }
+
+ return true;
+}
+
+bool TypeSanitizer::instrumentMemoryAccess(
+ Instruction *I, MemoryLocation &MLoc, Value *&ShadowBase,
+ Value *&AppMemMask, bool SanitizeFunction,
+ TypeDescriptorsMapTy &TypeDescriptors, const DataLayout &DL) {
+ IRBuilder<> IRB(I);
+ assert(MLoc.Size.isPrecise());
+ if (instrumentWithShadowUpdate(
+ IRB, MLoc.AATags.TBAA, const_cast<Value *>(MLoc.Ptr),
+ MLoc.Size.getValue(), I->mayReadFromMemory(), I->mayWriteToMemory(),
+ ShadowBase, AppMemMask, false, SanitizeFunction, TypeDescriptors,
+ DL)) {
+ ++NumInstrumentedAccesses;
+ return true;
+ }
+
+ return false;
+}
+
+// Memory-related intrinsics/instructions reset the type of the destination
+// memory (including allocas and byval arguments).
+bool TypeSanitizer::instrumentMemInst(Value *V, Value *&ShadowBase,
+ Value *&AppMemMask,
+ const DataLayout &DL) {
+ BasicBlock::iterator IP;
+ BasicBlock *BB;
+ Function *F;
+
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ IP = BasicBlock::iterator(I);
+ BB = I->getParent();
+ F = BB->getParent();
+ } else {
+ auto *A = cast<Argument>(V);
+ F = A->getParent();
+ BB = &F->getEntryBlock();
+ IP = BB->getFirstInsertionPt();
+
+ if (auto *I = cast_or_null<Instruction>(ShadowBase)) {
+ if (IP->comesBefore(I))
+ IP = I->getNextNode()->getIterator();
+ }
+ if (auto *I = cast_or_null<Instruction>(AppMemMask)) {
+ if (IP->comesBefore(I))
+ IP = I->getNextNode()->getIterator();
+ }
+ }
+
+ Value *Dest, *Size, *Src = nullptr;
+ bool NeedsMemMove = false;
+ IRBuilder<> IRB(BB, IP);
+
+ if (auto *A = dyn_cast<Argument>(V)) {
+ assert(A->hasByValAttr() && "Type reset for non-byval argument?");
+
+ Dest = A;
+ Size =
+ ConstantInt::get(IntptrTy, DL.getTypeAllocSize(A->getParamByValType()));
+ } else {
+ auto *I = cast<Instruction>(V);
+ if (auto *MI = dyn_cast<MemIntrinsic>(I)) {
+ if (MI->getDestAddressSpace() != 0)
+ return false;
+
+ Dest = MI->getDest();
+ Size = MI->getLength();
+
+ if (auto *MTI = dyn_cast<MemTransferInst>(MI)) {
+ if (MTI->getSourceAddressSpace() == 0) {
+ Src = MTI->getSource();
+ NeedsMemMove = isa<MemMoveInst>(MTI);
+ }
+ }
+ } else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
+ if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
+ II->getIntrinsicID() != Intrinsic::lifetime_end)
+ return false;
+
+ Size = II->getArgOperand(0);
+ Dest = II->getArgOperand(1);
+ } else if (auto *AI = dyn_cast<AllocaInst>(I)) {
+ // We need to clear the types for new stack allocations (or else we might
+ // read stale type information from a previous function execution).
+
+ IRB.SetInsertPoint(&*std::next(BasicBlock::iterator(I)));
+ IRB.SetInstDebugLocation(I);
+
+ Size = IRB.CreateMul(
+ IRB.CreateZExtOrTrunc(AI->getArraySize(), IntptrTy),
+ ConstantInt::get(IntptrTy,
+ DL.getTypeAllocSize(AI->getAllocatedType())));
+ Dest = I;
+ } else {
+ return false;
+ }
+ }
+
+ if (!ShadowBase)
+ ShadowBase = getShadowBase(*F);
+ if (!AppMemMask)
+ AppMemMask = getAppMemMask(*F);
+
+ Value *ShadowDataInt = IRB.CreateAdd(
+ IRB.CreateShl(
+ IRB.CreateAnd(IRB.CreatePtrToInt(Dest, IntptrTy), AppMemMask),
+ PtrShift),
+ ShadowBase);
+ Value *ShadowData = IRB.CreateIntToPtr(ShadowDataInt, IRB.getPtrTy());
+
+ if (!Src) {
+ IRB.CreateMemSet(ShadowData, IRB.getInt8(0), IRB.CreateShl(Size, PtrShift),
+ Align(1u << PtrShift));
+ return true;
+ }
+
+ Value *SrcShadowDataInt = IRB.CreateAdd(
+ IRB.CreateShl(
+ IRB.CreateAnd(IRB.CreatePtrToInt(Src, IntptrTy), AppMemMask),
+ PtrShift),
+ ShadowBase);
+ Value *SrcShadowData =
+ IRB.CreateIntToPtr(SrcShadowDataInt, IRB.getPtrTy());
+
+ if (NeedsMemMove) {
+ IRB.CreateMemMove(ShadowData, Align(1u << PtrShift), SrcShadowData,
+ Align(1u << PtrShift), IRB.CreateShl(Size, PtrShift));
+ } else {
+ IRB.CreateMemCpy(ShadowData, Align(1u << PtrShift), SrcShadowData,
+ Align(1u << PtrShift), IRB.CreateShl(Size, PtrShift));
+ }
+
+ return true;
+}
+
+PreservedAnalyses TypeSanitizerPass::run(Function &F,
+ FunctionAnalysisManager &FAM) {
+ TypeSanitizer TySan(*F.getParent());
+ TySan.run(F, FAM.getResult<TargetLibraryAnalysis>(F));
+ return PreservedAnalyses::none();
+}
+
+PreservedAnalyses ModuleTypeSanitizerPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ insertModuleCtor(M);
+ return PreservedAnalyses::none();
+}
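
Taken together, the shadow mapping and the per-byte descriptor layout used above reduce to the following minimal sketch, with the shadow modeled as a plain in-process array; kAppMemMask, kPtrShift, shadowFor and setType are illustrative names for this sketch only, not symbols of the TySan runtime.

#include <cassert>
#include <cstdint>
#include <vector>

// Illustrative constants; the real mask and base come from the runtime's
// __tysan_app_memory_mask / __tysan_shadow_memory_address globals.
constexpr uint64_t kAppMemMask = 0xFFF;
constexpr unsigned kPtrShift = 3; // one 8-byte descriptor slot per app byte

// Shadow memory modeled as one pointer-sized slot per application byte.
static std::vector<int64_t> Shadow(kAppMemMask + 1);

// Map an application address to its first shadow slot. The instrumented code
// computes a byte offset ((ptr & AppMemMask) << PtrShift) from ShadowBase;
// with 8-byte slots that is the same as indexing slot (ptr & AppMemMask).
static int64_t *shadowFor(uintptr_t AppAddr) {
  return Shadow.data() + (AppAddr & kAppMemMask);
}

// Set the type of an AccessSize-byte object: slot 0 holds the type
// descriptor and slot i holds -i to mark an interior byte, matching what the
// SetType() lambda above emits.
static void setType(uintptr_t AppAddr, int64_t TypeDescriptor,
                    uint64_t AccessSize) {
  int64_t *S = shadowFor(AppAddr);
  S[0] = TypeDescriptor;
  for (uint64_t i = 1; i < AccessSize; ++i)
    S[i] = -static_cast<int64_t>(i);
}

int main() {
  setType(/*AppAddr=*/0x40, /*TypeDescriptor=*/0x1234, /*AccessSize=*/4);
  int64_t *S = shadowFor(0x40);
  assert(S[0] == 0x1234 && S[1] == -1 && S[2] == -2 && S[3] == -3);
  return 0;
}
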
diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
index 6539f924c2edf4..610a77bc4c31ec 100644
--- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
@@ -935,6 +935,7 @@ Function *CodeExtractor::constructFunctionDeclaration(
case Attribute::SanitizeMemory:
case Attribute::SanitizeNumericalStability:
case Attribute::SanitizeThread:
+ case Attribute::SanitizeType:
case Attribute::SanitizeHWAddress:
case Attribute::SanitizeMemTag:
case Attribute::SanitizeRealtime:
diff --git a/llvm/test/Instrumentation/TypeSanitizer/access-with-offfset.ll b/llvm/test/Instrumentation/TypeSanitizer/access-with-offfset.ll
new file mode 100644
index 00000000000000..297ee83527b5c1
--- /dev/null
+++ b/llvm/test/Instrumentation/TypeSanitizer/access-with-offfset.ll
@@ -0,0 +1,71 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
+; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+
+;.
+; CHECK: @[[LLVM_GLOBAL_CTORS:[a-zA-Z0-9_$"\\.-]+]] = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr @tysan.module_ctor, ptr null }]
+; CHECK: @[[__TYSAN_V1_SIMPLE_20C_2FC_2B_2B_20TBAA:[a-zA-Z0-9_$"\\.-]+]] = linkonce_odr constant { i64, i64, [18 x i8] } { i64 2, i64 0, [18 x i8] c"Simple C/C++ TBAA\00" }, comdat
+; CHECK: @[[__TYSAN_V1_OMNIPOTENT_20CHAR:[a-zA-Z0-9_$"\\.-]+]] = linkonce_odr constant { i64, i64, ptr, i64, [16 x i8] } { i64 2, i64 1, ptr @__tysan_v1_Simple_20C_2fC_2b_2b_20TBAA, i64 0, [16 x i8] c"omnipotent char\00" }, comdat
+; CHECK: @[[__TYSAN_V1_ANY_20POINTER:[a-zA-Z0-9_$"\\.-]+]] = linkonce_odr constant { i64, i64, ptr, i64, [12 x i8] } { i64 2, i64 1, ptr @__tysan_v1_omnipotent_20char, i64 0, [12 x i8] c"any pointer\00" }, comdat
+; CHECK: @[[__TYSAN_V1_ANY_20POINTER_O_0:[a-zA-Z0-9_$"\\.-]+]] = linkonce_odr constant { i64, ptr, ptr, i64 } { i64 1, ptr @__tysan_v1_any_20pointer, ptr @__tysan_v1_any_20pointer, i64 0 }, comdat
+; CHECK: @[[LLVM_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [5 x ptr] [ptr @tysan.module_ctor, ptr @__tysan_v1_Simple_20C_2fC_2b_2b_20TBAA, ptr @__tysan_v1_omnipotent_20char, ptr @__tysan_v1_any_20pointer, ptr @__tysan_v1_any_20pointer_o_0], section "llvm.metadata"
+; CHECK: @[[__TYSAN_SHADOW_MEMORY_ADDRESS:[a-zA-Z0-9_$"\\.-]+]] = external global i64
+; CHECK: @[[__TYSAN_APP_MEMORY_MASK:[a-zA-Z0-9_$"\\.-]+]] = external global i64
+;.
+define ptr @test_load_offset(ptr %argv) {
+; CHECK-LABEL: @test_load_offset(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[APP_MEM_MASK:%.*]] = load i64, ptr @__tysan_app_memory_mask, align 4
+; CHECK-NEXT: [[SHADOW_BASE:%.*]] = load i64, ptr @__tysan_shadow_memory_address, align 4
+; CHECK-NEXT: [[APP_PTR_MASKED:%.*]] = and i64 0, [[APP_MEM_MASK]]
+; CHECK-NEXT: [[APP_PTR_SHIFTED:%.*]] = shl i64 [[APP_PTR_MASKED]], 3
+; CHECK-NEXT: [[SHADOW_PTR_INT:%.*]] = add i64 [[APP_PTR_SHIFTED]], [[SHADOW_BASE]]
+; CHECK-NEXT: [[SHADOW_PTR:%.*]] = inttoptr i64 [[SHADOW_PTR_INT]] to ptr
+; CHECK-NEXT: [[SHADOW_DESC:%.*]] = load ptr, ptr [[SHADOW_PTR]], align 8
+; CHECK-NEXT: [[DESC_SET:%.*]] = icmp eq ptr [[SHADOW_DESC]], null
+; CHECK-NEXT: br i1 [[DESC_SET]], label [[SET_TYPE:%.*]], label [[TMP0:%.*]], !prof [[PROF0:![0-9]+]]
+; CHECK: set.type:
+; CHECK-NEXT: store ptr @__tysan_v1_any_20pointer_o_0, ptr [[SHADOW_PTR]], align 8
+; CHECK-NEXT: [[SHADOW_BYTE_1_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 8
+; CHECK-NEXT: [[SHADOW_BYTE_1_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_1_OFFSET]] to ptr
+; CHECK-NEXT: store ptr inttoptr (i64 -1 to ptr), ptr [[SHADOW_BYTE_1_PTR]], align 8
+; CHECK-NEXT: [[SHADOW_BYTE_2_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 16
+; CHECK-NEXT: [[SHADOW_BYTE_2_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_2_OFFSET]] to ptr
+; CHECK-NEXT: store ptr inttoptr (i64 -2 to ptr), ptr [[SHADOW_BYTE_2_PTR]], align 8
+; CHECK-NEXT: [[SHADOW_BYTE_3_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 24
+; CHECK-NEXT: [[SHADOW_BYTE_3_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_3_OFFSET]] to ptr
+; CHECK-NEXT: store ptr inttoptr (i64 -3 to ptr), ptr [[SHADOW_BYTE_3_PTR]], align 8
+; CHECK-NEXT: [[SHADOW_BYTE_4_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 32
+; CHECK-NEXT: [[SHADOW_BYTE_4_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_4_OFFSET]] to ptr
+; CHECK-NEXT: store ptr inttoptr (i64 -4 to ptr), ptr [[SHADOW_BYTE_4_PTR]], align 8
+; CHECK-NEXT: [[SHADOW_BYTE_5_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 40
+; CHECK-NEXT: [[SHADOW_BYTE_5_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_5_OFFSET]] to ptr
+; CHECK-NEXT: store ptr inttoptr (i64 -5 to ptr), ptr [[SHADOW_BYTE_5_PTR]], align 8
+; CHECK-NEXT: [[SHADOW_BYTE_6_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 48
+; CHECK-NEXT: [[SHADOW_BYTE_6_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_6_OFFSET]] to ptr
+; CHECK-NEXT: store ptr inttoptr (i64 -6 to ptr), ptr [[SHADOW_BYTE_6_PTR]], align 8
+; CHECK-NEXT: [[SHADOW_BYTE_7_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 56
+; CHECK-NEXT: [[SHADOW_BYTE_7_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_7_OFFSET]] to ptr
+; CHECK-NEXT: store ptr inttoptr (i64 -7 to ptr), ptr [[SHADOW_BYTE_7_PTR]], align 8
+; CHECK-NEXT: br label [[TMP0]]
+; CHECK: 0:
+; CHECK-NEXT: [[L:%.*]] = load ptr, ptr null, align 8, !tbaa [[TBAA1:![0-9]+]]
+; CHECK-NEXT: ret ptr [[L]]
+;
+entry:
+ %l = load ptr, ptr null, align 8, !tbaa !0
+ ret ptr %l
+}
+
+!0 = !{!1, !1, i64 0}
+!1 = !{!"any pointer", !2, i64 0}
+!2 = !{!"omnipotent char", !3, i64 0}
+!3 = !{!"Simple C/C++ TBAA"}
+;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { nounwind }
+;.
+; CHECK: [[PROF0]] = !{!"branch_weights", i32 1, i32 100000}
+; CHECK: [[TBAA1]] = !{!2, !2, i64 0}
+; CHECK: [[META2:![0-9]+]] = !{!"any pointer", !3, i64 0}
+; CHECK: [[META3:![0-9]+]] = !{!"omnipotent char", !4, i64 0}
+; CHECK: [[META4:![0-9]+]] = !{!"Simple C/C++ TBAA"}
+;.
diff --git a/llvm/test/Instrumentation/TypeSanitizer/alloca.ll b/llvm/test/Instrumentation/TypeSanitizer/alloca.ll
new file mode 100644
index 00000000000000..94098bd8a1739a
--- /dev/null
+++ b/llvm/test/Instrumentation/TypeSanitizer/alloca.ll
@@ -0,0 +1,29 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; Test basic type sanitizer instrumentation.
+;
+; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare void @alloca_test_use(ptr)
+
+define void @alloca_test() sanitize_type {
+; CHECK-LABEL: @alloca_test(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[APP_MEM_MASK:%.*]] = load i64, ptr @__tysan_app_memory_mask, align 8
+; CHECK-NEXT: [[SHADOW_BASE:%.*]] = load i64, ptr @__tysan_shadow_memory_address, align 8
+; CHECK-NEXT: [[X:%.*]] = alloca [10 x i8], align 1
+; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[X]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], [[APP_MEM_MASK]]
+; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[TMP1]], 3
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP2]], [[SHADOW_BASE]]
+; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP4]], i8 0, i64 80, i1 false)
+; CHECK-NEXT: call void @alloca_test_use(ptr [[X]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %x = alloca [10 x i8], align 1
+ call void @alloca_test_use([10 x i8]* %x)
+ ret void
+}
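
The reset emitted for allocas (and likewise for byval arguments and memory intrinsics) just zeroes the shadow slots covering the object, returning those bytes to the unknown type. A minimal sketch, assuming a 64-bit target so that each slot is 8 bytes and the memset length is Size << PtrShift; resetType is an illustrative name, not a runtime entry point:

#include <cassert>
#include <cstdint>
#include <cstring>

constexpr unsigned kPtrShift = 3; // 8-byte descriptor slot per application byte

// Reset Size application bytes to "unknown type" by zeroing their shadow,
// mirroring the memset(shadow, 0, Size << PtrShift) emitted by the pass.
static void resetType(int64_t *ShadowSlots, uint64_t Size) {
  std::memset(ShadowSlots, 0, Size << kPtrShift);
}

int main() {
  // Shadow for a 10-byte alloca ([10 x i8]), as in the test above; the pass
  // emits a memset of 10 << 3 == 80 shadow bytes.
  int64_t Shadow[10] = {0x1234, -1, -2, -3, -4, -5, -6, -7, -8, -9};
  resetType(Shadow, 10);
  for (int64_t V : Shadow)
    assert(V == 0); // every covered byte is back to "unknown type"
  return 0;
}
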
diff --git a/llvm/test/Instrumentation/TypeSanitizer/anon.ll b/llvm/test/Instrumentation/TypeSanitizer/anon.ll
new file mode 100644
index 00000000000000..70f5dcefde64c1
--- /dev/null
+++ b/llvm/test/Instrumentation/TypeSanitizer/anon.ll
@@ -0,0 +1,283 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
+; Test basic type sanitizer instrumentation.
+;
+; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; CHECK: @[[LLVM_GLOBAL_CTORS:[a-zA-Z0-9_$"\\.-]+]] = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr @tysan.module_ctor, ptr null }]
+; CHECK: @[[__TYSAN_V1_SIMPLE_20C_2B_2B_20TBAA:[a-zA-Z0-9_$"\\.-]+]] = linkonce_odr constant { i64, i64, [16 x i8] } { i64 2, i64 0, [16 x i8] c"Simple C++ TBAA\00" }, comdat
+; CHECK: @[[__TYSAN_V1_OMNIPOTENT_20CHAR:[a-zA-Z0-9_$"\\.-]+]] = linkonce_odr constant { i64, i64, ptr, i64, [16 x i8] } { i64 2, i64 1, ptr @__tysan_v1_Simple_20C_2b_2b_20TBAA, i64 0, [16 x i8] c"omnipotent char\00" }, comdat
+; CHECK: @[[__TYSAN_V1_INT:[a-zA-Z0-9_$"\\.-]+]] = linkonce_odr constant { i64, i64, ptr, i64, [4 x i8] } { i64 2, i64 1, ptr @__tysan_v1_omnipotent_20char, i64 0, [4 x i8] c"int\00" }, comdat
+; CHECK: @[[__TYSAN_V1___ZTSN12__GLOBAL____N__11ZE:[a-zA-Z0-9_$"\\.-]+]] = internal constant { i64, i64, ptr, i64, [23 x i8] } { i64 2, i64 1, ptr @__tysan_v1_int, i64 24, [23 x i8] c"_ZTSN12_GLOBAL__N_11zE\00" }
+; CHECK: @[[__TYSAN_V1___ZTSN12__GLOBAL____N__11ZE_O_24:[a-zA-Z0-9_$"\\.-]+]] = internal constant { i64, ptr, ptr, i64 } { i64 1, ptr @__tysan_v1___ZTSN12__GLOBAL____N__11zE, ptr @__tysan_v1_int, i64 24 }
+; CHECK: @[[__TYSAN_V1___ZTS1YIN12__GLOBAL____N__11ZEE:[a-zA-Z0-9_$"\\.-]+]] = internal constant { i64, i64, ptr, i64, [27 x i8] } { i64 2, i64 1, ptr @__tysan_v1_int, i64 24, [27 x i8] c"_ZTS1yIN12_GLOBAL__N_11zEE\00" }
+; CHECK: @[[__TYSAN_V1___ZTS1YIN12__GLOBAL____N__11ZEE_O_24:[a-zA-Z0-9_$"\\.-]+]] = internal constant { i64, ptr, ptr, i64 } { i64 1, ptr @__tysan_v1___ZTS1yIN12__GLOBAL____N__11zEE, ptr @__tysan_v1_int, i64 24 }
+; CHECK: @[[__TYSAN_SHADOW_MEMORY_ADDRESS:[a-zA-Z0-9_$"\\.-]+]] = external global i64
+; CHECK: @[[__TYSAN_APP_MEMORY_MASK:[a-zA-Z0-9_$"\\.-]+]] = external global i64
+; CHECK: @[[__TYSAN_V1_____ANONYMOUS__027D9E575C5D34CB5D60D6A1D6276F95:[a-zA-Z0-9_$"\\.-]+]] = linkonce_odr constant { i64, i64, ptr, i64, [1 x i8] } { i64 2, i64 1, ptr @__tysan_v1_int, i64 24, [1 x i8] zeroinitializer }, comdat
+; CHECK: @[[__TYSAN_V1_____ANONYMOUS__027D9E575C5D34CB5D60D6A1D6276F95_O_24:[a-zA-Z0-9_$"\\.-]+]] = linkonce_odr constant { i64, ptr, ptr, i64 } { i64 1, ptr @__tysan_v1_____anonymous__027d9e575c5d34cb5d60d6a1d6276f95, ptr @__tysan_v1_int, i64 24 }, comdat
+; CHECK: @[[LLVM_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [6 x ptr] [ptr @tysan.module_ctor, ptr @__tysan_v1_Simple_20C_2b_2b_20TBAA, ptr @__tysan_v1_omnipotent_20char, ptr @__tysan_v1_int, ptr @__tysan_v1_____anonymous__027d9e575c5d34cb5d60d6a1d6276f95, ptr @__tysan_v1_____anonymous__027d9e575c5d34cb5d60d6a1d6276f95_o_24], section "llvm.metadata"
+
+
+define void @test_anon_ns(ptr %a, ptr %b) sanitize_type {
+; CHECK-LABEL: @test_anon_ns(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[APP_MEM_MASK:%.*]] = load i64, ptr @__tysan_app_memory_mask, align 8
+; CHECK-NEXT: [[SHADOW_BASE:%.*]] = load i64, ptr @__tysan_shadow_memory_address, align 8
+; CHECK-NEXT: [[APP_PTR_INT:%.*]] = ptrtoint ptr [[A:%.*]] to i64
+; CHECK-NEXT: [[APP_PTR_MASKED:%.*]] = and i64 [[APP_PTR_INT]], [[APP_MEM_MASK]]
+; CHECK-NEXT: [[APP_PTR_SHIFTED:%.*]] = shl i64 [[APP_PTR_MASKED]], 3
+; CHECK-NEXT: [[SHADOW_PTR_INT:%.*]] = add i64 [[APP_PTR_SHIFTED]], [[SHADOW_BASE]]
+; CHECK-NEXT: [[SHADOW_PTR:%.*]] = inttoptr i64 [[SHADOW_PTR_INT]] to ptr
+; CHECK-NEXT: [[SHADOW_DESC:%.*]] = load ptr, ptr [[SHADOW_PTR]], align 8
+; CHECK-NEXT: [[BAD_DESC:%.*]] = icmp ne ptr [[SHADOW_DESC]], @__tysan_v1___ZTSN12__GLOBAL____N__11zE_o_24
+; CHECK-NEXT: br i1 [[BAD_DESC]], label [[TMP0:%.*]], label [[TMP22:%.*]], !prof [[PROF0:![0-9]+]]
+; CHECK: 0:
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq ptr [[SHADOW_DESC]], null
+; CHECK-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP20:%.*]]
+; CHECK: 2:
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[SHADOW_PTR_INT]], 8
+; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ne ptr [[TMP5]], null
+; CHECK-NEXT: [[TMP7:%.*]] = or i1 false, [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[SHADOW_PTR_INT]], 16
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8
+; CHECK-NEXT: [[TMP11:%.*]] = icmp ne ptr [[TMP10]], null
+; CHECK-NEXT: [[TMP12:%.*]] = or i1 [[TMP7]], [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[SHADOW_PTR_INT]], 24
+; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
+; CHECK-NEXT: [[TMP15:%.*]] = load ptr, ptr [[TMP14]], align 8
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne ptr [[TMP15]], null
+; CHECK-NEXT: [[TMP17:%.*]] = or i1 [[TMP12]], [[TMP16]]
+; CHECK-NEXT: br i1 [[TMP17]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF0]]
+; CHECK: 18:
+; CHECK-NEXT: call void @__tysan_check(ptr [[A]], i32 4, ptr @__tysan_v1___ZTSN12__GLOBAL____N__11zE_o_24, i32 2)
+; CHECK-NEXT: br label [[TMP19]]
+; CHECK: 19:
+; CHECK-NEXT: store ptr @__tysan_v1___ZTSN12__GLOBAL____N__11zE_o_24, ptr [[SHADOW_PTR]], align 8
+; CHECK-NEXT: [[SHADOW_BYTE_1_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 8
+; CHECK-NEXT: [[SHADOW_BYTE_1_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_1_OFFSET]] to ptr
+; CHECK-NEXT: store ptr inttoptr (i64 -1 to ptr), ptr [[SHADOW_BYTE_1_PTR]], align 8
+; CHECK-NEXT: [[SHADOW_BYTE_2_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 16
+; CHECK-NEXT: [[SHADOW_BYTE_2_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_2_OFFSET]] to ptr
+; CHECK-NEXT: store ptr inttoptr (i64 -2 to ptr), ptr [[SHADOW_BYTE_2_PTR]], align 8
+; CHECK-NEXT: [[SHADOW_BYTE_3_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 24
+; CHECK-NEXT: [[SHADOW_BYTE_3_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_3_OFFSET]] to ptr
+; CHECK-NEXT: store ptr inttoptr (i64 -3 to ptr), ptr [[SHADOW_BYTE_3_PTR]], align 8
+; CHECK-NEXT: br label [[TMP21:%.*]]
+; CHECK: 20:
+; CHECK-NEXT: call void @__tysan_check(ptr [[A]], i32 4, ptr @__tysan_v1___ZTSN12__GLOBAL____N__11zE_o_24, i32 2)
+; CHECK-NEXT: br label [[TMP21]]
+; CHECK: 21:
+; CHECK-NEXT: br label [[TMP43:%.*]]
+; CHECK: 22:
+; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[SHADOW_PTR_INT]], 8
+; CHECK-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+; CHECK-NEXT: [[TMP25:%.*]] = load ptr, ptr [[TMP24]], align 8
+; CHECK-NEXT: [[TMP26:%.*]] = ptrtoint ptr [[TMP25]] to i64
+; CHECK-NEXT: [[TMP27:%.*]] = icmp sge i64 [[TMP26]], 0
+; CHECK-NEXT: [[TMP28:%.*]] = or i1 false, [[TMP27]]
+; CHECK-NEXT: [[TMP29:%.*]] = add i64 [[SHADOW_PTR_INT]], 16
+; CHECK-NEXT: [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+; CHECK-NEXT: [[TMP31:%.*]] = load ptr, ptr [[TMP30]], align 8
+; CHECK-NEXT: [[TMP32:%.*]] = ptrtoint ptr [[TMP31]] to i64
+; CHECK-NEXT: [[TMP33:%.*]] = icmp sge i64 [[TMP32]], 0
+; CHECK-NEXT: [[TMP34:%.*]] = or i1 [[TMP28]], [[TMP33]]
+; CHECK-NEXT: [[TMP35:%.*]] = add i64 [[SHADOW_PTR_INT]], 24
+; CHECK-NEXT: [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+; CHECK-NEXT: [[TMP37:%.*]] = load ptr, ptr [[TMP36]], align 8
+; CHECK-NEXT: [[TMP38:%.*]] = ptrtoint ptr [[TMP37]] to i64
+; CHECK-NEXT: [[TMP39:%.*]] = icmp sge i64 [[TMP38]], 0
+; CHECK-NEXT: [[TMP40:%.*]] = or i1 [[TMP34]], [[TMP39]]
+; CHECK-NEXT: br i1 [[TMP40]], label [[TMP41:%.*]], label [[TMP42:%.*]], !prof [[PROF0]]
+; CHECK: 41:
+; CHECK-NEXT: call void @__tysan_check(ptr [[A]], i32 4, ptr @__tysan_v1___ZTSN12__GLOBAL____N__11zE_o_24, i32 2)
+; CHECK-NEXT: br label [[TMP42]]
+; CHECK: 42:
+; CHECK-NEXT: br label [[TMP43]]
+; CHECK: 43:
+; CHECK-NEXT: store i32 42, ptr [[A]], align 4, !tbaa [[TBAA1:![0-9]+]]
+; CHECK-NEXT: [[APP_PTR_INT1:%.*]] = ptrtoint ptr [[B:%.*]] to i64
+; CHECK-NEXT: [[APP_PTR_MASKED2:%.*]] = and i64 [[APP_PTR_INT1]], [[APP_MEM_MASK]]
+; CHECK-NEXT: [[APP_PTR_SHIFTED3:%.*]] = shl i64 [[APP_PTR_MASKED2]], 3
+; CHECK-NEXT: [[SHADOW_PTR_INT4:%.*]] = add i64 [[APP_PTR_SHIFTED3]], [[SHADOW_BASE]]
+; CHECK-NEXT: [[SHADOW_PTR5:%.*]] = inttoptr i64 [[SHADOW_PTR_INT4]] to ptr
+; CHECK-NEXT: [[SHADOW_DESC6:%.*]] = load ptr, ptr [[SHADOW_PTR5]], align 8
+; CHECK-NEXT: [[BAD_DESC7:%.*]] = icmp ne ptr [[SHADOW_DESC6]], @__tysan_v1___ZTS1yIN12__GLOBAL____N__11zEE_o_24
+; CHECK-NEXT: br i1 [[BAD_DESC7]], label [[TMP44:%.*]], label [[TMP66:%.*]], !prof [[PROF0]]
+; CHECK: 44:
+; CHECK-NEXT: [[TMP45:%.*]] = icmp eq ptr [[SHADOW_DESC6]], null
+; CHECK-NEXT: br i1 [[TMP45]], label [[TMP46:%.*]], label [[TMP64:%.*]]
+; CHECK: 46:
+; CHECK-NEXT: [[TMP47:%.*]] = add i64 [[SHADOW_PTR_INT4]], 8
+; CHECK-NEXT: [[TMP48:%.*]] = inttoptr i64 [[TMP47]] to ptr
+; CHECK-NEXT: [[TMP49:%.*]] = load ptr, ptr [[TMP48]], align 8
+; CHECK-NEXT: [[TMP50:%.*]] = icmp ne ptr [[TMP49]], null
+; CHECK-NEXT: [[TMP51:%.*]] = or i1 false, [[TMP50]]
+; CHECK-NEXT: [[TMP52:%.*]] = add i64 [[SHADOW_PTR_INT4]], 16
+; CHECK-NEXT: [[TMP53:%.*]] = inttoptr i64 [[TMP52]] to ptr
+; CHECK-NEXT: [[TMP54:%.*]] = load ptr, ptr [[TMP53]], align 8
+; CHECK-NEXT: [[TMP55:%.*]] = icmp ne ptr [[TMP54]], null
+; CHECK-NEXT: [[TMP56:%.*]] = or i1 [[TMP51]], [[TMP55]]
+; CHECK-NEXT: [[TMP57:%.*]] = add i64 [[SHADOW_PTR_INT4]], 24
+; CHECK-NEXT: [[TMP58:%.*]] = inttoptr i64 [[TMP57]] to ptr
+; CHECK-NEXT: [[TMP59:%.*]] = load ptr, ptr [[TMP58]], align 8
+; CHECK-NEXT: [[TMP60:%.*]] = icmp ne ptr [[TMP59]], null
+; CHECK-NEXT: [[TMP61:%.*]] = or i1 [[TMP56]], [[TMP60]]
+; CHECK-NEXT: br i1 [[TMP61]], label [[TMP62:%.*]], label [[TMP63:%.*]], !prof [[PROF0]]
+; CHECK: 62:
+; CHECK-NEXT: call void @__tysan_check(ptr [[B]], i32 4, ptr @__tysan_v1___ZTS1yIN12__GLOBAL____N__11zEE_o_24, i32 2)
+; CHECK-NEXT: br label [[TMP63]]
+; CHECK: 63:
+; CHECK-NEXT: store ptr @__tysan_v1___ZTS1yIN12__GLOBAL____N__11zEE_o_24, ptr [[SHADOW_PTR5]], align 8
+; CHECK-NEXT: [[SHADOW_BYTE_1_OFFSET8:%.*]] = add i64 [[SHADOW_PTR_INT4]], 8
+; CHECK-NEXT: [[SHADOW_BYTE_1_PTR9:%.*]] = inttoptr i64 [[SHADOW_BYTE_1_OFFSET8]] to ptr
+; CHECK-NEXT: store ptr inttoptr (i64 -1 to ptr), ptr [[SHADOW_BYTE_1_PTR9]], align 8
+; CHECK-NEXT: [[SHADOW_BYTE_2_OFFSET10:%.*]] = add i64 [[SHADOW_PTR_INT4]], 16
+; CHECK-NEXT: [[SHADOW_BYTE_2_PTR11:%.*]] = inttoptr i64 [[SHADOW_BYTE_2_OFFSET10]] to ptr
+; CHECK-NEXT: store ptr inttoptr (i64 -2 to ptr), ptr [[SHADOW_BYTE_2_PTR11]], align 8
+; CHECK-NEXT: [[SHADOW_BYTE_3_OFFSET12:%.*]] = add i64 [[SHADOW_PTR_INT4]], 24
+; CHECK-NEXT: [[SHADOW_BYTE_3_PTR13:%.*]] = inttoptr i64 [[SHADOW_BYTE_3_OFFSET12]] to ptr
+; CHECK-NEXT: store ptr inttoptr (i64 -3 to ptr), ptr [[SHADOW_BYTE_3_PTR13]], align 8
+; CHECK-NEXT: br label [[TMP65:%.*]]
+; CHECK: 64:
+; CHECK-NEXT: call void @__tysan_check(ptr [[B]], i32 4, ptr @__tysan_v1___ZTS1yIN12__GLOBAL____N__11zEE_o_24, i32 2)
+; CHECK-NEXT: br label [[TMP65]]
+; CHECK: 65:
+; CHECK-NEXT: br label [[TMP87:%.*]]
+; CHECK: 66:
+; CHECK-NEXT: [[TMP67:%.*]] = add i64 [[SHADOW_PTR_INT4]], 8
+; CHECK-NEXT: [[TMP68:%.*]] = inttoptr i64 [[TMP67]] to ptr
+; CHECK-NEXT: [[TMP69:%.*]] = load ptr, ptr [[TMP68]], align 8
+; CHECK-NEXT: [[TMP70:%.*]] = ptrtoint ptr [[TMP69]] to i64
+; CHECK-NEXT: [[TMP71:%.*]] = icmp sge i64 [[TMP70]], 0
+; CHECK-NEXT: [[TMP72:%.*]] = or i1 false, [[TMP71]]
+; CHECK-NEXT: [[TMP73:%.*]] = add i64 [[SHADOW_PTR_INT4]], 16
+; CHECK-NEXT: [[TMP74:%.*]] = inttoptr i64 [[TMP73]] to ptr
+; CHECK-NEXT: [[TMP75:%.*]] = load ptr, ptr [[TMP74]], align 8
+; CHECK-NEXT: [[TMP76:%.*]] = ptrtoint ptr [[TMP75]] to i64
+; CHECK-NEXT: [[TMP77:%.*]] = icmp sge i64 [[TMP76]], 0
+; CHECK-NEXT: [[TMP78:%.*]] = or i1 [[TMP72]], [[TMP77]]
+; CHECK-NEXT: [[TMP79:%.*]] = add i64 [[SHADOW_PTR_INT4]], 24
+; CHECK-NEXT: [[TMP80:%.*]] = inttoptr i64 [[TMP79]] to ptr
+; CHECK-NEXT: [[TMP81:%.*]] = load ptr, ptr [[TMP80]], align 8
+; CHECK-NEXT: [[TMP82:%.*]] = ptrtoint ptr [[TMP81]] to i64
+; CHECK-NEXT: [[TMP83:%.*]] = icmp sge i64 [[TMP82]], 0
+; CHECK-NEXT: [[TMP84:%.*]] = or i1 [[TMP78]], [[TMP83]]
+; CHECK-NEXT: br i1 [[TMP84]], label [[TMP85:%.*]], label [[TMP86:%.*]], !prof [[PROF0]]
+; CHECK: 85:
+; CHECK-NEXT: call void @__tysan_check(ptr [[B]], i32 4, ptr @__tysan_v1___ZTS1yIN12__GLOBAL____N__11zEE_o_24, i32 2)
+; CHECK-NEXT: br label [[TMP86]]
+; CHECK: 86:
+; CHECK-NEXT: br label [[TMP87]]
+; CHECK: 87:
+; CHECK-NEXT: store i32 43, ptr [[B]], align 4, !tbaa [[TBAA6:![0-9]+]]
+; CHECK-NEXT: ret void
+;
+entry:
+ store i32 42, ptr %a, align 4, !tbaa !8
+ store i32 43, ptr %b, align 4, !tbaa !10
+ ret void
+
+}
+
+define void @test_anon_type(ptr %a) sanitize_type {
+; CHECK-LABEL: @test_anon_type(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[APP_MEM_MASK:%.*]] = load i64, ptr @__tysan_app_memory_mask, align 8
+; CHECK-NEXT: [[SHADOW_BASE:%.*]] = load i64, ptr @__tysan_shadow_memory_address, align 8
+; CHECK-NEXT: [[APP_PTR_INT:%.*]] = ptrtoint ptr [[A:%.*]] to i64
+; CHECK-NEXT: [[APP_PTR_MASKED:%.*]] = and i64 [[APP_PTR_INT]], [[APP_MEM_MASK]]
+; CHECK-NEXT: [[APP_PTR_SHIFTED:%.*]] = shl i64 [[APP_PTR_MASKED]], 3
+; CHECK-NEXT: [[SHADOW_PTR_INT:%.*]] = add i64 [[APP_PTR_SHIFTED]], [[SHADOW_BASE]]
+; CHECK-NEXT: [[SHADOW_PTR:%.*]] = inttoptr i64 [[SHADOW_PTR_INT]] to ptr
+; CHECK-NEXT: [[SHADOW_DESC:%.*]] = load ptr, ptr [[SHADOW_PTR]], align 8
+; CHECK-NEXT: [[BAD_DESC:%.*]] = icmp ne ptr [[SHADOW_DESC]], @__tysan_v1_____anonymous__027d9e575c5d34cb5d60d6a1d6276f95_o_24
+; CHECK-NEXT: br i1 [[BAD_DESC]], label [[TMP0:%.*]], label [[TMP22:%.*]], !prof [[PROF0]]
+; CHECK: 0:
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq ptr [[SHADOW_DESC]], null
+; CHECK-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP20:%.*]]
+; CHECK: 2:
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[SHADOW_PTR_INT]], 8
+; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ne ptr [[TMP5]], null
+; CHECK-NEXT: [[TMP7:%.*]] = or i1 false, [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[SHADOW_PTR_INT]], 16
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8
+; CHECK-NEXT: [[TMP11:%.*]] = icmp ne ptr [[TMP10]], null
+; CHECK-NEXT: [[TMP12:%.*]] = or i1 [[TMP7]], [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[SHADOW_PTR_INT]], 24
+; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
+; CHECK-NEXT: [[TMP15:%.*]] = load ptr, ptr [[TMP14]], align 8
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne ptr [[TMP15]], null
+; CHECK-NEXT: [[TMP17:%.*]] = or i1 [[TMP12]], [[TMP16]]
+; CHECK-NEXT: br i1 [[TMP17]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF0]]
+; CHECK: 18:
+; CHECK-NEXT: call void @__tysan_check(ptr [[A]], i32 4, ptr @__tysan_v1_____anonymous__027d9e575c5d34cb5d60d6a1d6276f95_o_24, i32 2)
+; CHECK-NEXT: br label [[TMP19]]
+; CHECK: 19:
+; CHECK-NEXT: store ptr @__tysan_v1_____anonymous__027d9e575c5d34cb5d60d6a1d6276f95_o_24, ptr [[SHADOW_PTR]], align 8
+; CHECK-NEXT: [[SHADOW_BYTE_1_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 8
+; CHECK-NEXT: [[SHADOW_BYTE_1_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_1_OFFSET]] to ptr
+; CHECK-NEXT: store ptr inttoptr (i64 -1 to ptr), ptr [[SHADOW_BYTE_1_PTR]], align 8
+; CHECK-NEXT: [[SHADOW_BYTE_2_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 16
+; CHECK-NEXT: [[SHADOW_BYTE_2_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_2_OFFSET]] to ptr
+; CHECK-NEXT: store ptr inttoptr (i64 -2 to ptr), ptr [[SHADOW_BYTE_2_PTR]], align 8
+; CHECK-NEXT: [[SHADOW_BYTE_3_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 24
+; CHECK-NEXT: [[SHADOW_BYTE_3_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_3_OFFSET]] to ptr
+; CHECK-NEXT: store ptr inttoptr (i64 -3 to ptr), ptr [[SHADOW_BYTE_3_PTR]], align 8
+; CHECK-NEXT: br label [[TMP21:%.*]]
+; CHECK: 20:
+; CHECK-NEXT: call void @__tysan_check(ptr [[A]], i32 4, ptr @__tysan_v1_____anonymous__027d9e575c5d34cb5d60d6a1d6276f95_o_24, i32 2)
+; CHECK-NEXT: br label [[TMP21]]
+; CHECK: 21:
+; CHECK-NEXT: br label [[TMP43:%.*]]
+; CHECK: 22:
+; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[SHADOW_PTR_INT]], 8
+; CHECK-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+; CHECK-NEXT: [[TMP25:%.*]] = load ptr, ptr [[TMP24]], align 8
+; CHECK-NEXT: [[TMP26:%.*]] = ptrtoint ptr [[TMP25]] to i64
+; CHECK-NEXT: [[TMP27:%.*]] = icmp sge i64 [[TMP26]], 0
+; CHECK-NEXT: [[TMP28:%.*]] = or i1 false, [[TMP27]]
+; CHECK-NEXT: [[TMP29:%.*]] = add i64 [[SHADOW_PTR_INT]], 16
+; CHECK-NEXT: [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+; CHECK-NEXT: [[TMP31:%.*]] = load ptr, ptr [[TMP30]], align 8
+; CHECK-NEXT: [[TMP32:%.*]] = ptrtoint ptr [[TMP31]] to i64
+; CHECK-NEXT: [[TMP33:%.*]] = icmp sge i64 [[TMP32]], 0
+; CHECK-NEXT: [[TMP34:%.*]] = or i1 [[TMP28]], [[TMP33]]
+; CHECK-NEXT: [[TMP35:%.*]] = add i64 [[SHADOW_PTR_INT]], 24
+; CHECK-NEXT: [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+; CHECK-NEXT: [[TMP37:%.*]] = load ptr, ptr [[TMP36]], align 8
+; CHECK-NEXT: [[TMP38:%.*]] = ptrtoint ptr [[TMP37]] to i64
+; CHECK-NEXT: [[TMP39:%.*]] = icmp sge i64 [[TMP38]], 0
+; CHECK-NEXT: [[TMP40:%.*]] = or i1 [[TMP34]], [[TMP39]]
+; CHECK-NEXT: br i1 [[TMP40]], label [[TMP41:%.*]], label [[TMP42:%.*]], !prof [[PROF0]]
+; CHECK: 41:
+; CHECK-NEXT: call void @__tysan_check(ptr [[A]], i32 4, ptr @__tysan_v1_____anonymous__027d9e575c5d34cb5d60d6a1d6276f95_o_24, i32 2)
+; CHECK-NEXT: br label [[TMP42]]
+; CHECK: 42:
+; CHECK-NEXT: br label [[TMP43]]
+; CHECK: 43:
+; CHECK-NEXT: store i32 42, ptr [[A]], align 4, !tbaa [[TBAA8:![0-9]+]]
+; CHECK-NEXT: ret void
+;
+entry:
+ store i32 42, ptr %a, align 4, !tbaa !12
+ ret void
+
+}
+
+!0 = !{!"Simple C++ TBAA"}
+!1 = !{!"omnipotent char", !0, i64 0}
+!2 = !{!"int", !1, i64 0}
+!7 = !{!"_ZTSN12_GLOBAL__N_11zE", !2, i64 24}
+!8 = !{!7, !2, i64 24}
+!9 = !{!"_ZTS1yIN12_GLOBAL__N_11zEE", !2, i64 24}
+!10 = !{!9, !2, i64 24}
+!11 = !{!"", !2, i64 24}
+!12 = !{!11, !2, i64 24}
diff --git a/llvm/test/Instrumentation/TypeSanitizer/basic-nosan.ll b/llvm/test/Instrumentation/TypeSanitizer/basic-nosan.ll
new file mode 100644
index 00000000000000..7f25e36e6660e8
--- /dev/null
+++ b/llvm/test/Instrumentation/TypeSanitizer/basic-nosan.ll
@@ -0,0 +1,93 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals --include-generated-funcs
+; Test basic type sanitizer instrumentation.
+; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; CHECK: @[[LLVM_GLOBAL_CTORS:[a-zA-Z0-9_$"\\.-]+]] = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr @tysan.module_ctor, ptr null }]
+; CHECK: @[[__TYSAN_V1_SIMPLE_20C_2B_2B_20TBAA:[a-zA-Z0-9_$"\\.-]+]] = linkonce_odr constant { i64, i64, [16 x i8] } { i64 2, i64 0, [16 x i8] c"Simple C++ TBAA\00" }, comdat
+; CHECK: @[[__TYSAN_V1_OMNIPOTENT_20CHAR:[a-zA-Z0-9_$"\\.-]+]] = linkonce_odr constant { i64, i64, ptr, i64, [16 x i8] } { i64 2, i64 1, ptr @__tysan_v1_Simple_20C_2b_2b_20TBAA, i64 0, [16 x i8] c"omnipotent char\00" }, comdat
+; CHECK: @[[__TYSAN_V1_INT:[a-zA-Z0-9_$"\\.-]+]] = linkonce_odr constant { i64, i64, ptr, i64, [4 x i8] } { i64 2, i64 1, ptr @__tysan_v1_omnipotent_20char, i64 0, [4 x i8] c"int\00" }, comdat
+; CHECK: @[[__TYSAN_V1_INT_O_0:[a-zA-Z0-9_$"\\.-]+]] = linkonce_odr constant { i64, ptr, ptr, i64 } { i64 1, ptr @__tysan_v1_int, ptr @__tysan_v1_int, i64 0 }, comdat
+; CHECK: @[[LLVM_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [5 x ptr] [ptr @tysan.module_ctor, ptr @__tysan_v1_Simple_20C_2b_2b_20TBAA, ptr @__tysan_v1_omnipotent_20char, ptr @__tysan_v1_int, ptr @__tysan_v1_int_o_0], section "llvm.metadata"
+; CHECK: @[[__TYSAN_SHADOW_MEMORY_ADDRESS:[a-zA-Z0-9_$"\\.-]+]] = external global i64
+; CHECK: @[[__TYSAN_APP_MEMORY_MASK:[a-zA-Z0-9_$"\\.-]+]] = external global i64
+;
+define i32 @test_load_nsan(ptr %a) {
+; CHECK-LABEL: @test_load_nsan(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[APP_MEM_MASK:%.*]] = load i64, ptr @__tysan_app_memory_mask, align 8
+; CHECK-NEXT: [[SHADOW_BASE:%.*]] = load i64, ptr @__tysan_shadow_memory_address, align 8
+; CHECK-NEXT: [[APP_PTR_INT:%.*]] = ptrtoint ptr [[A:%.*]] to i64
+; CHECK-NEXT: [[APP_PTR_MASKED:%.*]] = and i64 [[APP_PTR_INT]], [[APP_MEM_MASK]]
+; CHECK-NEXT: [[APP_PTR_SHIFTED:%.*]] = shl i64 [[APP_PTR_MASKED]], 3
+; CHECK-NEXT: [[SHADOW_PTR_INT:%.*]] = add i64 [[APP_PTR_SHIFTED]], [[SHADOW_BASE]]
+; CHECK-NEXT: [[SHADOW_PTR:%.*]] = inttoptr i64 [[SHADOW_PTR_INT]] to ptr
+; CHECK-NEXT: [[SHADOW_DESC:%.*]] = load ptr, ptr [[SHADOW_PTR]], align 8
+; CHECK-NEXT: [[DESC_SET:%.*]] = icmp eq ptr [[SHADOW_DESC]], null
+; CHECK-NEXT: br i1 [[DESC_SET]], label [[SET_TYPE:%.*]], label [[TMP0:%.*]], !prof [[PROF0:![0-9]+]]
+; CHECK: set.type:
+; CHECK-NEXT: store ptr @__tysan_v1_int_o_0, ptr [[SHADOW_PTR]], align 8
+; CHECK-NEXT: [[SHADOW_BYTE_1_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 8
+; CHECK-NEXT: [[SHADOW_BYTE_1_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_1_OFFSET]] to ptr
+; CHECK-NEXT: store ptr inttoptr (i64 -1 to ptr), ptr [[SHADOW_BYTE_1_PTR]], align 8
+; CHECK-NEXT: [[SHADOW_BYTE_2_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 16
+; CHECK-NEXT: [[SHADOW_BYTE_2_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_2_OFFSET]] to ptr
+; CHECK-NEXT: store ptr inttoptr (i64 -2 to ptr), ptr [[SHADOW_BYTE_2_PTR]], align 8
+; CHECK-NEXT: [[SHADOW_BYTE_3_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 24
+; CHECK-NEXT: [[SHADOW_BYTE_3_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_3_OFFSET]] to ptr
+; CHECK-NEXT: store ptr inttoptr (i64 -3 to ptr), ptr [[SHADOW_BYTE_3_PTR]], align 8
+; CHECK-NEXT: br label [[TMP0]]
+; CHECK: 0:
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[A]], align 4, !tbaa [[TBAA1:![0-9]+]]
+; CHECK-NEXT: ret i32 [[TMP1]]
+;
+entry:
+ %tmp1 = load i32, ptr %a, align 4, !tbaa !3
+ ret i32 %tmp1
+}
+
+define void @test_store_nsan(ptr %a) {
+; CHECK-LABEL: @test_store_nsan(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[APP_MEM_MASK:%.*]] = load i64, ptr @__tysan_app_memory_mask, align 8
+; CHECK-NEXT: [[SHADOW_BASE:%.*]] = load i64, ptr @__tysan_shadow_memory_address, align 8
+; CHECK-NEXT: [[APP_PTR_INT:%.*]] = ptrtoint ptr [[A:%.*]] to i64
+; CHECK-NEXT: [[APP_PTR_MASKED:%.*]] = and i64 [[APP_PTR_INT]], [[APP_MEM_MASK]]
+; CHECK-NEXT: [[APP_PTR_SHIFTED:%.*]] = shl i64 [[APP_PTR_MASKED]], 3
+; CHECK-NEXT: [[SHADOW_PTR_INT:%.*]] = add i64 [[APP_PTR_SHIFTED]], [[SHADOW_BASE]]
+; CHECK-NEXT: [[SHADOW_PTR:%.*]] = inttoptr i64 [[SHADOW_PTR_INT]] to ptr
+; CHECK-NEXT: [[SHADOW_DESC:%.*]] = load ptr, ptr [[SHADOW_PTR]], align 8
+; CHECK-NEXT: [[DESC_SET:%.*]] = icmp eq ptr [[SHADOW_DESC]], null
+; CHECK-NEXT: br i1 [[DESC_SET]], label [[SET_TYPE:%.*]], label [[TMP0:%.*]], !prof [[PROF0]]
+; CHECK: set.type:
+; CHECK-NEXT: store ptr @__tysan_v1_int_o_0, ptr [[SHADOW_PTR]], align 8
+; CHECK-NEXT: [[SHADOW_BYTE_1_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 8
+; CHECK-NEXT: [[SHADOW_BYTE_1_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_1_OFFSET]] to ptr
+; CHECK-NEXT: store ptr inttoptr (i64 -1 to ptr), ptr [[SHADOW_BYTE_1_PTR]], align 8
+; CHECK-NEXT: [[SHADOW_BYTE_2_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 16
+; CHECK-NEXT: [[SHADOW_BYTE_2_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_2_OFFSET]] to ptr
+; CHECK-NEXT: store ptr inttoptr (i64 -2 to ptr), ptr [[SHADOW_BYTE_2_PTR]], align 8
+; CHECK-NEXT: [[SHADOW_BYTE_3_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 24
+; CHECK-NEXT: [[SHADOW_BYTE_3_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_3_OFFSET]] to ptr
+; CHECK-NEXT: store ptr inttoptr (i64 -3 to ptr), ptr [[SHADOW_BYTE_3_PTR]], align 8
+; CHECK-NEXT: br label [[TMP0]]
+; CHECK: 0:
+; CHECK-NEXT: store i32 42, ptr [[A]], align 4, !tbaa [[TBAA1]]
+; CHECK-NEXT: ret void
+;
+entry:
+ store i32 42, ptr %a, align 4, !tbaa !3
+ ret void
+}
+
+; CHECK-LABEL: @tysan.module_ctor(
+; CHECK-NEXT: call void @__tysan_init()
+; CHECK-NEXT: ret void
+;
+; CHECK: [[PROF0]] = !{!"branch_weights", i32 1, i32 100000}
+
+!0 = !{!"Simple C++ TBAA"}
+!1 = !{!"omnipotent char", !0, i64 0}
+!2 = !{!"int", !1, i64 0}
+!3 = !{!2, !2, i64 0}
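
For functions without the sanitize_type attribute, as in this test, the instrumentation only fills in missing type information and never calls __tysan_check. A minimal sketch of that path, assuming a 64-bit target and illustrative names (maybeSetType is not a runtime entry point):

#include <cassert>
#include <cstdint>

// If the first shadow slot is still unknown (null), record the access type;
// otherwise leave the shadow untouched -- non-sanitized code never checks.
static void maybeSetType(int64_t *S, int64_t TD, uint64_t AccessSize) {
  if (S[0] != 0)
    return;                          // a type is already recorded
  S[0] = TD;                         // descriptor for the first byte
  for (uint64_t i = 1; i < AccessSize; ++i)
    S[i] = -static_cast<int64_t>(i); // interior-byte markers
}

int main() {
  int64_t Shadow[4] = {};          // shadow for a 4-byte int, initially unknown
  maybeSetType(Shadow, 0x1234, 4); // sets the type
  maybeSetType(Shadow, 0x5678, 4); // no-op: a type is already present
  assert(Shadow[0] == 0x1234);
  return 0;
}
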
diff --git a/llvm/test/Instrumentation/TypeSanitizer/basic.ll b/llvm/test/Instrumentation/TypeSanitizer/basic.ll
new file mode 100644
index 00000000000000..132df722e83c25
--- /dev/null
+++ b/llvm/test/Instrumentation/TypeSanitizer/basic.ll
@@ -0,0 +1,214 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
+; Test basic type sanitizer instrumentation.
+;
+; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+
+; CHECK: @[[LLVM_GLOBAL_CTORS:[a-zA-Z0-9_$"\\.-]+]] = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr @tysan.module_ctor, ptr null }]
+; CHECK: @[[__TYSAN_V1_SIMPLE_20C_2B_2B_20TBAA:[a-zA-Z0-9_$"\\.-]+]] = linkonce_odr constant { i64, i64, [16 x i8] } { i64 2, i64 0, [16 x i8] c"Simple C++ TBAA\00" }, comdat
+; CHECK: @[[__TYSAN_V1_OMNIPOTENT_20CHAR:[a-zA-Z0-9_$"\\.-]+]] = linkonce_odr constant { i64, i64, ptr, i64, [16 x i8] } { i64 2, i64 1, ptr @__tysan_v1_Simple_20C_2b_2b_20TBAA, i64 0, [16 x i8] c"omnipotent char\00" }, comdat
+; CHECK: @[[__TYSAN_V1_INT:[a-zA-Z0-9_$"\\.-]+]] = linkonce_odr constant { i64, i64, ptr, i64, [4 x i8] } { i64 2, i64 1, ptr @__tysan_v1_omnipotent_20char, i64 0, [4 x i8] c"int\00" }, comdat
+; CHECK: @[[__TYSAN_V1_INT_O_0:[a-zA-Z0-9_$"\\.-]+]] = linkonce_odr constant { i64, ptr, ptr, i64 } { i64 1, ptr @__tysan_v1_int, ptr @__tysan_v1_int, i64 0 }, comdat
+; CHECK: @[[__TYSAN_SHADOW_MEMORY_ADDRESS:[a-zA-Z0-9_$"\\.-]+]] = external global i64
+; CHECK: @[[__TYSAN_APP_MEMORY_MASK:[a-zA-Z0-9_$"\\.-]+]] = external global i64
+; CHECK: @[[__TYSAN_V1___ZTS1X:[a-zA-Z0-9_$"\\.-]+]] = linkonce_odr constant { i64, i64, ptr, i64, ptr, i64, [7 x i8] } { i64 2, i64 2, ptr @__tysan_v1_int, i64 0, ptr @__tysan_v1_int, i64 4, [7 x i8] c"_ZTS1x\00" }, comdat
+; CHECK: @[[__TYSAN_V1___ZTS1V:[a-zA-Z0-9_$"\\.-]+]] = linkonce_odr constant { i64, i64, ptr, i64, ptr, i64, ptr, i64, [7 x i8] } { i64 2, i64 3, ptr @__tysan_v1_int, i64 8, ptr @__tysan_v1_int, i64 12, ptr @__tysan_v1___ZTS1x, i64 16, [7 x i8] c"_ZTS1v\00" }, comdat
+; CHECK: @[[__TYSAN_V1___ZTS1V_O_12:[a-zA-Z0-9_$"\\.-]+]] = linkonce_odr constant { i64, ptr, ptr, i64 } { i64 1, ptr @__tysan_v1___ZTS1v, ptr @__tysan_v1_int, i64 12 }, comdat
+; CHECK: @[[LLVM_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [8 x ptr] [ptr @tysan.module_ctor, ptr @__tysan_v1_Simple_20C_2b_2b_20TBAA, ptr @__tysan_v1_omnipotent_20char, ptr @__tysan_v1_int, ptr @__tysan_v1_int_o_0, ptr @__tysan_v1___ZTS1x, ptr @__tysan_v1___ZTS1v, ptr @__tysan_v1___ZTS1v_o_12], section "llvm.metadata"
+
+define i32 @test_load(ptr %a) sanitize_type {
+; CHECK-LABEL: @test_load(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[APP_MEM_MASK:%.*]] = load i64, ptr @__tysan_app_memory_mask, align 8
+; CHECK-NEXT: [[SHADOW_BASE:%.*]] = load i64, ptr @__tysan_shadow_memory_address, align 8
+; CHECK-NEXT: [[APP_PTR_INT:%.*]] = ptrtoint ptr [[A:%.*]] to i64
+; CHECK-NEXT: [[APP_PTR_MASKED:%.*]] = and i64 [[APP_PTR_INT]], [[APP_MEM_MASK]]
+; CHECK-NEXT: [[APP_PTR_SHIFTED:%.*]] = shl i64 [[APP_PTR_MASKED]], 3
+; CHECK-NEXT: [[SHADOW_PTR_INT:%.*]] = add i64 [[APP_PTR_SHIFTED]], [[SHADOW_BASE]]
+; CHECK-NEXT: [[SHADOW_PTR:%.*]] = inttoptr i64 [[SHADOW_PTR_INT]] to ptr
+; CHECK-NEXT: [[SHADOW_DESC:%.*]] = load ptr, ptr [[SHADOW_PTR]], align 8
+; CHECK-NEXT: [[BAD_DESC:%.*]] = icmp ne ptr [[SHADOW_DESC]], @__tysan_v1_int_o_0
+; CHECK-NEXT: br i1 [[BAD_DESC]], label [[TMP0:%.*]], label [[TMP22:%.*]], !prof [[PROF0:![0-9]+]]
+; CHECK: 0:
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq ptr [[SHADOW_DESC]], null
+; CHECK-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP20:%.*]]
+; CHECK: 2:
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[SHADOW_PTR_INT]], 8
+; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ne ptr [[TMP5]], null
+; CHECK-NEXT: [[TMP7:%.*]] = or i1 false, [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[SHADOW_PTR_INT]], 16
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8
+; CHECK-NEXT: [[TMP11:%.*]] = icmp ne ptr [[TMP10]], null
+; CHECK-NEXT: [[TMP12:%.*]] = or i1 [[TMP7]], [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[SHADOW_PTR_INT]], 24
+; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
+; CHECK-NEXT: [[TMP15:%.*]] = load ptr, ptr [[TMP14]], align 8
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne ptr [[TMP15]], null
+; CHECK-NEXT: [[TMP17:%.*]] = or i1 [[TMP12]], [[TMP16]]
+; CHECK-NEXT: br i1 [[TMP17]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF0]]
+; CHECK: 18:
+; CHECK-NEXT: call void @__tysan_check(ptr [[A]], i32 4, ptr @__tysan_v1_int_o_0, i32 1)
+; CHECK-NEXT: br label [[TMP19]]
+; CHECK: 19:
+; CHECK-NEXT: store ptr @__tysan_v1_int_o_0, ptr [[SHADOW_PTR]], align 8
+; CHECK-NEXT: [[SHADOW_BYTE_1_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 8
+; CHECK-NEXT: [[SHADOW_BYTE_1_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_1_OFFSET]] to ptr
+; CHECK-NEXT: store ptr inttoptr (i64 -1 to ptr), ptr [[SHADOW_BYTE_1_PTR]], align 8
+; CHECK-NEXT: [[SHADOW_BYTE_2_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 16
+; CHECK-NEXT: [[SHADOW_BYTE_2_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_2_OFFSET]] to ptr
+; CHECK-NEXT: store ptr inttoptr (i64 -2 to ptr), ptr [[SHADOW_BYTE_2_PTR]], align 8
+; CHECK-NEXT: [[SHADOW_BYTE_3_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 24
+; CHECK-NEXT: [[SHADOW_BYTE_3_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_3_OFFSET]] to ptr
+; CHECK-NEXT: store ptr inttoptr (i64 -3 to ptr), ptr [[SHADOW_BYTE_3_PTR]], align 8
+; CHECK-NEXT: br label [[TMP21:%.*]]
+; CHECK: 20:
+; CHECK-NEXT: call void @__tysan_check(ptr [[A]], i32 4, ptr @__tysan_v1_int_o_0, i32 1)
+; CHECK-NEXT: br label [[TMP21]]
+; CHECK: 21:
+; CHECK-NEXT: br label [[TMP43:%.*]]
+; CHECK: 22:
+; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[SHADOW_PTR_INT]], 8
+; CHECK-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+; CHECK-NEXT: [[TMP25:%.*]] = load ptr, ptr [[TMP24]], align 8
+; CHECK-NEXT: [[TMP26:%.*]] = ptrtoint ptr [[TMP25]] to i64
+; CHECK-NEXT: [[TMP27:%.*]] = icmp sge i64 [[TMP26]], 0
+; CHECK-NEXT: [[TMP28:%.*]] = or i1 false, [[TMP27]]
+; CHECK-NEXT: [[TMP29:%.*]] = add i64 [[SHADOW_PTR_INT]], 16
+; CHECK-NEXT: [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+; CHECK-NEXT: [[TMP31:%.*]] = load ptr, ptr [[TMP30]], align 8
+; CHECK-NEXT: [[TMP32:%.*]] = ptrtoint ptr [[TMP31]] to i64
+; CHECK-NEXT: [[TMP33:%.*]] = icmp sge i64 [[TMP32]], 0
+; CHECK-NEXT: [[TMP34:%.*]] = or i1 [[TMP28]], [[TMP33]]
+; CHECK-NEXT: [[TMP35:%.*]] = add i64 [[SHADOW_PTR_INT]], 24
+; CHECK-NEXT: [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+; CHECK-NEXT: [[TMP37:%.*]] = load ptr, ptr [[TMP36]], align 8
+; CHECK-NEXT: [[TMP38:%.*]] = ptrtoint ptr [[TMP37]] to i64
+; CHECK-NEXT: [[TMP39:%.*]] = icmp sge i64 [[TMP38]], 0
+; CHECK-NEXT: [[TMP40:%.*]] = or i1 [[TMP34]], [[TMP39]]
+; CHECK-NEXT: br i1 [[TMP40]], label [[TMP41:%.*]], label [[TMP42:%.*]], !prof [[PROF0]]
+; CHECK: 41:
+; CHECK-NEXT: call void @__tysan_check(ptr [[A]], i32 4, ptr @__tysan_v1_int_o_0, i32 1)
+; CHECK-NEXT: br label [[TMP42]]
+; CHECK: 42:
+; CHECK-NEXT: br label [[TMP43]]
+; CHECK: 43:
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[A]], align 4, !tbaa [[TBAA1:![0-9]+]]
+; CHECK-NEXT: ret i32 [[TMP1]]
+;
+entry:
+ %tmp1 = load i32, ptr %a, align 4, !tbaa !3
+ ret i32 %tmp1
+}
+
+define void @test_store(ptr %a) sanitize_type {
+; CHECK-LABEL: @test_store(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[APP_MEM_MASK:%.*]] = load i64, ptr @__tysan_app_memory_mask, align 8
+; CHECK-NEXT: [[SHADOW_BASE:%.*]] = load i64, ptr @__tysan_shadow_memory_address, align 8
+; CHECK-NEXT: [[APP_PTR_INT:%.*]] = ptrtoint ptr [[A:%.*]] to i64
+; CHECK-NEXT: [[APP_PTR_MASKED:%.*]] = and i64 [[APP_PTR_INT]], [[APP_MEM_MASK]]
+; CHECK-NEXT: [[APP_PTR_SHIFTED:%.*]] = shl i64 [[APP_PTR_MASKED]], 3
+; CHECK-NEXT: [[SHADOW_PTR_INT:%.*]] = add i64 [[APP_PTR_SHIFTED]], [[SHADOW_BASE]]
+; CHECK-NEXT: [[SHADOW_PTR:%.*]] = inttoptr i64 [[SHADOW_PTR_INT]] to ptr
+; CHECK-NEXT: [[SHADOW_DESC:%.*]] = load ptr, ptr [[SHADOW_PTR]], align 8
+; CHECK-NEXT: [[BAD_DESC:%.*]] = icmp ne ptr [[SHADOW_DESC]], @__tysan_v1___ZTS1v_o_12
+; CHECK-NEXT: br i1 [[BAD_DESC]], label [[TMP0:%.*]], label [[TMP22:%.*]], !prof [[PROF0]]
+; CHECK: 0:
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq ptr [[SHADOW_DESC]], null
+; CHECK-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP20:%.*]]
+; CHECK: 2:
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[SHADOW_PTR_INT]], 8
+; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ne ptr [[TMP5]], null
+; CHECK-NEXT: [[TMP7:%.*]] = or i1 false, [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[SHADOW_PTR_INT]], 16
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8
+; CHECK-NEXT: [[TMP11:%.*]] = icmp ne ptr [[TMP10]], null
+; CHECK-NEXT: [[TMP12:%.*]] = or i1 [[TMP7]], [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[SHADOW_PTR_INT]], 24
+; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
+; CHECK-NEXT: [[TMP15:%.*]] = load ptr, ptr [[TMP14]], align 8
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne ptr [[TMP15]], null
+; CHECK-NEXT: [[TMP17:%.*]] = or i1 [[TMP12]], [[TMP16]]
+; CHECK-NEXT: br i1 [[TMP17]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF0]]
+; CHECK: 18:
+; CHECK-NEXT: call void @__tysan_check(ptr [[A]], i32 4, ptr @__tysan_v1___ZTS1v_o_12, i32 2)
+; CHECK-NEXT: br label [[TMP19]]
+; CHECK: 19:
+; CHECK-NEXT: store ptr @__tysan_v1___ZTS1v_o_12, ptr [[SHADOW_PTR]], align 8
+; CHECK-NEXT: [[SHADOW_BYTE_1_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 8
+; CHECK-NEXT: [[SHADOW_BYTE_1_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_1_OFFSET]] to ptr
+; CHECK-NEXT: store ptr inttoptr (i64 -1 to ptr), ptr [[SHADOW_BYTE_1_PTR]], align 8
+; CHECK-NEXT: [[SHADOW_BYTE_2_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 16
+; CHECK-NEXT: [[SHADOW_BYTE_2_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_2_OFFSET]] to ptr
+; CHECK-NEXT: store ptr inttoptr (i64 -2 to ptr), ptr [[SHADOW_BYTE_2_PTR]], align 8
+; CHECK-NEXT: [[SHADOW_BYTE_3_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 24
+; CHECK-NEXT: [[SHADOW_BYTE_3_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_3_OFFSET]] to ptr
+; CHECK-NEXT: store ptr inttoptr (i64 -3 to ptr), ptr [[SHADOW_BYTE_3_PTR]], align 8
+; CHECK-NEXT: br label [[TMP21:%.*]]
+; CHECK: 20:
+; CHECK-NEXT: call void @__tysan_check(ptr [[A]], i32 4, ptr @__tysan_v1___ZTS1v_o_12, i32 2)
+; CHECK-NEXT: br label [[TMP21]]
+; CHECK: 21:
+; CHECK-NEXT: br label [[TMP43:%.*]]
+; CHECK: 22:
+; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[SHADOW_PTR_INT]], 8
+; CHECK-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+; CHECK-NEXT: [[TMP25:%.*]] = load ptr, ptr [[TMP24]], align 8
+; CHECK-NEXT: [[TMP26:%.*]] = ptrtoint ptr [[TMP25]] to i64
+; CHECK-NEXT: [[TMP27:%.*]] = icmp sge i64 [[TMP26]], 0
+; CHECK-NEXT: [[TMP28:%.*]] = or i1 false, [[TMP27]]
+; CHECK-NEXT: [[TMP29:%.*]] = add i64 [[SHADOW_PTR_INT]], 16
+; CHECK-NEXT: [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+; CHECK-NEXT: [[TMP31:%.*]] = load ptr, ptr [[TMP30]], align 8
+; CHECK-NEXT: [[TMP32:%.*]] = ptrtoint ptr [[TMP31]] to i64
+; CHECK-NEXT: [[TMP33:%.*]] = icmp sge i64 [[TMP32]], 0
+; CHECK-NEXT: [[TMP34:%.*]] = or i1 [[TMP28]], [[TMP33]]
+; CHECK-NEXT: [[TMP35:%.*]] = add i64 [[SHADOW_PTR_INT]], 24
+; CHECK-NEXT: [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+; CHECK-NEXT: [[TMP37:%.*]] = load ptr, ptr [[TMP36]], align 8
+; CHECK-NEXT: [[TMP38:%.*]] = ptrtoint ptr [[TMP37]] to i64
+; CHECK-NEXT: [[TMP39:%.*]] = icmp sge i64 [[TMP38]], 0
+; CHECK-NEXT: [[TMP40:%.*]] = or i1 [[TMP34]], [[TMP39]]
+; CHECK-NEXT: br i1 [[TMP40]], label [[TMP41:%.*]], label [[TMP42:%.*]], !prof [[PROF0]]
+; CHECK: 41:
+; CHECK-NEXT: call void @__tysan_check(ptr [[A]], i32 4, ptr @__tysan_v1___ZTS1v_o_12, i32 2)
+; CHECK-NEXT: br label [[TMP42]]
+; CHECK: 42:
+; CHECK-NEXT: br label [[TMP43]]
+; CHECK: 43:
+; CHECK-NEXT: store i32 42, ptr [[A]], align 4, !tbaa [[TBAA5:![0-9]+]]
+; CHECK-NEXT: ret void
+;
+entry:
+ store i32 42, ptr %a, align 4, !tbaa !6
+ ret void
+}
+
+!0 = !{!"Simple C++ TBAA"}
+!1 = !{!"omnipotent char", !0, i64 0}
+!2 = !{!"int", !1, i64 0}
+!3 = !{!2, !2, i64 0}
+!4 = !{!"_ZTS1x", !2, i64 0, !2, i64 4}
+!5 = !{!"_ZTS1v", !2, i64 8, !2, i64 12, !4, i64 16}
+!6 = !{!5, !2, i64 12}
+;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { sanitize_type }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { nounwind }
+;.
+; CHECK: [[PROF0]] = !{!"branch_weights", i32 1, i32 100000}
+; CHECK: [[TBAA1]] = !{!2, !2, i64 0}
+; CHECK: [[META2:![0-9]+]] = !{!"int", !3, i64 0}
+; CHECK: [[META3:![0-9]+]] = !{!"omnipotent char", !4, i64 0}
+; CHECK: [[META4:![0-9]+]] = !{!"Simple C++ TBAA"}
+; CHECK: [[TBAA5]] = !{!6, !2, i64 12}
+; CHECK: [[META6:![0-9]+]] = !{!"_ZTS1v", !2, i64 8, !2, i64 12, !7, i64 16}
+; CHECK: [[META7:![0-9]+]] = !{!"_ZTS1x", !2, i64 0, !2, i64 4}
+;.
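
The fast path emitted for sanitized functions, as exercised above, boils down to: the first shadow slot must equal the expected descriptor, and every other slot covered by the access must hold a negative interior marker, which is what the icmp sge ..., 0 comparisons test. A small sketch under those assumptions (shadowMatches is an illustrative name only):

#include <cassert>
#include <cstdint>

// Fast-path check for an AccessSize-byte access whose shadow starts at S:
// slot 0 must hold the expected descriptor TD and every following slot must
// be a negative interior marker; anything else falls through to the runtime.
static bool shadowMatches(const int64_t *S, int64_t TD, uint64_t AccessSize) {
  if (S[0] != TD)
    return false;   // unknown or mismatching descriptor: take the slow path
  for (uint64_t i = 1; i < AccessSize; ++i)
    if (S[i] >= 0)
      return false; // not an interior byte: hand off to __tysan_check
  return true;
}

int main() {
  const int64_t TD = 0x1234;                   // hypothetical descriptor value
  int64_t Good[4] = {TD, -1, -2, -3};          // a well-formed 4-byte object
  int64_t Clobbered[4] = {TD, -1, 0x5678, -3}; // byte 2 was given another type
  assert(shadowMatches(Good, TD, 4));
  assert(!shadowMatches(Clobbered, TD, 4));
  return 0;
}
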
diff --git a/llvm/test/Instrumentation/TypeSanitizer/byval.ll b/llvm/test/Instrumentation/TypeSanitizer/byval.ll
new file mode 100644
index 00000000000000..68ab1327b225bd
--- /dev/null
+++ b/llvm/test/Instrumentation/TypeSanitizer/byval.ll
@@ -0,0 +1,88 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals --include-generated-funcs
+; Test basic type sanitizer instrumentation.
+; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+;.
+; CHECK: @[[LLVM_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [1 x ptr] [ptr @tysan.module_ctor], section "llvm.metadata"
+; CHECK: @[[LLVM_GLOBAL_CTORS:[a-zA-Z0-9_$"\\.-]+]] = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr @tysan.module_ctor, ptr null }]
+; CHECK: @[[__TYSAN_SHADOW_MEMORY_ADDRESS:[a-zA-Z0-9_$"\\.-]+]] = external global i64
+; CHECK: @[[__TYSAN_APP_MEMORY_MASK:[a-zA-Z0-9_$"\\.-]+]] = external global i64
+;.
+
+%struct.s20 = type { i32, i32, [24 x i8] }
+define void @byval_test(ptr byval(%struct.s20) align 32 %x) sanitize_type {
+; CHECK-LABEL: @byval_test(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[APP_MEM_MASK:%.*]] = load i64, ptr @__tysan_app_memory_mask, align 8
+; CHECK-NEXT: [[SHADOW_BASE:%.*]] = load i64, ptr @__tysan_shadow_memory_address, align 8
+; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[X:%.*]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], [[APP_MEM_MASK]]
+; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[TMP1]], 3
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP2]], [[SHADOW_BASE]]
+; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP4]], i8 0, i64 256, i1 false)
+; CHECK-NEXT: ret void
+;
+entry:
+ ret void
+; NOTE: Ideally, we'd get the type from the caller's copy of the data (instead
+; of setting it all to unknown).
+}
+
+%struct = type { ptr, ptr }
+
+define ptr @test_insert_point(ptr byval(%struct) %v) {
+; CHECK-LABEL: @test_insert_point(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[APP_MEM_MASK:%.*]] = load i64, ptr @__tysan_app_memory_mask, align 8
+; CHECK-NEXT: [[SHADOW_BASE:%.*]] = load i64, ptr @__tysan_shadow_memory_address, align 8
+; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[V:%.*]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], [[APP_MEM_MASK]]
+; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[TMP1]], 3
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP2]], [[SHADOW_BASE]]
+; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP4]], i8 0, i64 128, i1 false)
+; CHECK-NEXT: [[NAME:%.*]] = getelementptr inbounds [[STRUCT:%.*]], ptr [[V]], i64 0, i32 1
+; CHECK-NEXT: [[APP_PTR_INT:%.*]] = ptrtoint ptr [[NAME]] to i64
+; CHECK-NEXT: [[APP_PTR_MASKED:%.*]] = and i64 [[APP_PTR_INT]], [[APP_MEM_MASK]]
+; CHECK-NEXT: [[APP_PTR_SHIFTED:%.*]] = shl i64 [[APP_PTR_MASKED]], 3
+; CHECK-NEXT: [[SHADOW_PTR_INT:%.*]] = add i64 [[APP_PTR_SHIFTED]], [[SHADOW_BASE]]
+; CHECK-NEXT: [[SHADOW_PTR:%.*]] = inttoptr i64 [[SHADOW_PTR_INT]] to ptr
+; CHECK-NEXT: [[SHADOW_DESC:%.*]] = load ptr, ptr [[SHADOW_PTR]], align 8
+; CHECK-NEXT: [[DESC_SET:%.*]] = icmp eq ptr [[SHADOW_DESC]], null
+; CHECK-NEXT: br i1 [[DESC_SET]], label [[SET_TYPE:%.*]], label [[TMP5:%.*]], !prof [[PROF0:![0-9]+]]
+; CHECK: set.type:
+; CHECK-NEXT: store ptr null, ptr [[SHADOW_PTR]], align 8
+; CHECK-NEXT: [[SHADOW_BYTE_1_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 8
+; CHECK-NEXT: [[SHADOW_BYTE_1_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_1_OFFSET]] to ptr
+; CHECK-NEXT: store ptr inttoptr (i64 -1 to ptr), ptr [[SHADOW_BYTE_1_PTR]], align 8
+; CHECK-NEXT: [[SHADOW_BYTE_2_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 16
+; CHECK-NEXT: [[SHADOW_BYTE_2_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_2_OFFSET]] to ptr
+; CHECK-NEXT: store ptr inttoptr (i64 -2 to ptr), ptr [[SHADOW_BYTE_2_PTR]], align 8
+; CHECK-NEXT: [[SHADOW_BYTE_3_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 24
+; CHECK-NEXT: [[SHADOW_BYTE_3_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_3_OFFSET]] to ptr
+; CHECK-NEXT: store ptr inttoptr (i64 -3 to ptr), ptr [[SHADOW_BYTE_3_PTR]], align 8
+; CHECK-NEXT: [[SHADOW_BYTE_4_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 32
+; CHECK-NEXT: [[SHADOW_BYTE_4_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_4_OFFSET]] to ptr
+; CHECK-NEXT: store ptr inttoptr (i64 -4 to ptr), ptr [[SHADOW_BYTE_4_PTR]], align 8
+; CHECK-NEXT: [[SHADOW_BYTE_5_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 40
+; CHECK-NEXT: [[SHADOW_BYTE_5_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_5_OFFSET]] to ptr
+; CHECK-NEXT: store ptr inttoptr (i64 -5 to ptr), ptr [[SHADOW_BYTE_5_PTR]], align 8
+; CHECK-NEXT: [[SHADOW_BYTE_6_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 48
+; CHECK-NEXT: [[SHADOW_BYTE_6_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_6_OFFSET]] to ptr
+; CHECK-NEXT: store ptr inttoptr (i64 -6 to ptr), ptr [[SHADOW_BYTE_6_PTR]], align 8
+; CHECK-NEXT: [[SHADOW_BYTE_7_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 56
+; CHECK-NEXT: [[SHADOW_BYTE_7_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_7_OFFSET]] to ptr
+; CHECK-NEXT: store ptr inttoptr (i64 -7 to ptr), ptr [[SHADOW_BYTE_7_PTR]], align 8
+; CHECK-NEXT: br label [[TMP5]]
+; CHECK: 5:
+; CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[NAME]], align 8
+; CHECK-NEXT: ret ptr [[TMP6]]
+;
+entry:
+ %name = getelementptr inbounds %struct, ptr %v, i64 0, i32 1
+ %0 = load ptr, ptr %name, align 8
+ ret ptr %0
+}
diff --git a/llvm/test/Instrumentation/TypeSanitizer/globals.ll b/llvm/test/Instrumentation/TypeSanitizer/globals.ll
new file mode 100644
index 00000000000000..05d0fd348444db
--- /dev/null
+++ b/llvm/test/Instrumentation/TypeSanitizer/globals.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals --include-generated-funcs
+; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+ at global1 = global i32 0, align 4
+ at global2 = global i32 0, align 4
+
+; CHECK: @[[GLOBAL1:[a-zA-Z0-9_$"\\.-]+]] = global i32 0, align 4
+; CHECK: @[[GLOBAL2:[a-zA-Z0-9_$"\\.-]+]] = global i32 0, align 4
+; CHECK: @[[__TYSAN_V1_SIMPLE_20C_2B_2B_20TBAA:[a-zA-Z0-9_$"\\.-]+]] = linkonce_odr constant { i64, i64, [16 x i8] } { i64 2, i64 0, [16 x i8] c"Simple C++ TBAA\00" }, comdat
+; CHECK: @[[__TYSAN_V1_OMNIPOTENT_20CHAR:[a-zA-Z0-9_$"\\.-]+]] = linkonce_odr constant { i64, i64, ptr, i64, [16 x i8] } { i64 2, i64 1, ptr @__tysan_v1_Simple_20C_2b_2b_20TBAA, i64 0, [16 x i8] c"omnipotent char\00" }, comdat
+; CHECK: @[[__TYSAN_V1_INT:[a-zA-Z0-9_$"\\.-]+]] = linkonce_odr constant { i64, i64, ptr, i64, [4 x i8] } { i64 2, i64 1, ptr @__tysan_v1_omnipotent_20char, i64 0, [4 x i8] c"int\00" }, comdat
+; CHECK: @[[LLVM_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [4 x ptr] [ptr @tysan.module_ctor, ptr @__tysan_v1_Simple_20C_2b_2b_20TBAA, ptr @__tysan_v1_omnipotent_20char, ptr @__tysan_v1_int], section "llvm.metadata"
+; CHECK: @[[__TYSAN_SHADOW_MEMORY_ADDRESS:[a-zA-Z0-9_$"\\.-]+]] = external global i64
+; CHECK: @[[__TYSAN_APP_MEMORY_MASK:[a-zA-Z0-9_$"\\.-]+]] = external global i64
+; CHECK: @[[LLVM_GLOBAL_CTORS:[a-zA-Z0-9_$"\\.-]+]] = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr @tysan.module_ctor, ptr null }]
+;.
+; CHECK-LABEL: define internal void @tysan.module_ctor(
+; CHECK-NEXT: call void @__tysan_init()
+; CHECK-NEXT: call void @__tysan_set_globals_types()
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: define internal void @__tysan_set_globals_types(
+; CHECK-NEXT: [[APP_MEM_MASK:%.*]] = load i64, ptr @__tysan_app_memory_mask, align 8
+; CHECK-NEXT: [[SHADOW_BASE:%.*]] = load i64, ptr @__tysan_shadow_memory_address, align 8
+; CHECK-NEXT: [[APP_PTR_MASKED:%.*]] = and i64 ptrtoint (ptr @global1 to i64), [[APP_MEM_MASK]]
+; CHECK-NEXT: [[APP_PTR_SHIFTED:%.*]] = shl i64 [[APP_PTR_MASKED]], 3
+; CHECK-NEXT: [[SHADOW_PTR_INT:%.*]] = add i64 [[APP_PTR_SHIFTED]], [[SHADOW_BASE]]
+; CHECK-NEXT: [[SHADOW_PTR:%.*]] = inttoptr i64 [[SHADOW_PTR_INT]] to ptr
+; CHECK-NEXT: store ptr @__tysan_v1_int, ptr [[SHADOW_PTR]], align 8
+; CHECK-NEXT: [[SHADOW_BYTE_1_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 8
+; CHECK-NEXT: [[SHADOW_BYTE_1_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_1_OFFSET]] to ptr
+; CHECK-NEXT: store ptr inttoptr (i64 -1 to ptr), ptr [[SHADOW_BYTE_1_PTR]], align 8
+; CHECK-NEXT: [[SHADOW_BYTE_2_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 16
+; CHECK-NEXT: [[SHADOW_BYTE_2_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_2_OFFSET]] to ptr
+; CHECK-NEXT: store ptr inttoptr (i64 -2 to ptr), ptr [[SHADOW_BYTE_2_PTR]], align 8
+; CHECK-NEXT: [[SHADOW_BYTE_3_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 24
+; CHECK-NEXT: [[SHADOW_BYTE_3_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_3_OFFSET]] to ptr
+; CHECK-NEXT: store ptr inttoptr (i64 -3 to ptr), ptr [[SHADOW_BYTE_3_PTR]], align 8
+; CHECK-NEXT: [[APP_PTR_MASKED1:%.*]] = and i64 ptrtoint (ptr @global2 to i64), [[APP_MEM_MASK]]
+; CHECK-NEXT: [[APP_PTR_SHIFTED2:%.*]] = shl i64 [[APP_PTR_MASKED1]], 3
+; CHECK-NEXT: [[SHADOW_PTR_INT3:%.*]] = add i64 [[APP_PTR_SHIFTED2]], [[SHADOW_BASE]]
+; CHECK-NEXT: [[SHADOW_PTR4:%.*]] = inttoptr i64 [[SHADOW_PTR_INT3]] to ptr
+; CHECK-NEXT: store ptr @__tysan_v1_int, ptr [[SHADOW_PTR4]], align 8
+; CHECK-NEXT: [[SHADOW_BYTE_1_OFFSET5:%.*]] = add i64 [[SHADOW_PTR_INT3]], 8
+; CHECK-NEXT: [[SHADOW_BYTE_1_PTR6:%.*]] = inttoptr i64 [[SHADOW_BYTE_1_OFFSET5]] to ptr
+; CHECK-NEXT: store ptr inttoptr (i64 -1 to ptr), ptr [[SHADOW_BYTE_1_PTR6]], align 8
+; CHECK-NEXT: [[SHADOW_BYTE_2_OFFSET7:%.*]] = add i64 [[SHADOW_PTR_INT3]], 16
+; CHECK-NEXT: [[SHADOW_BYTE_2_PTR8:%.*]] = inttoptr i64 [[SHADOW_BYTE_2_OFFSET7]] to ptr
+; CHECK-NEXT: store ptr inttoptr (i64 -2 to ptr), ptr [[SHADOW_BYTE_2_PTR8]], align 8
+; CHECK-NEXT: [[SHADOW_BYTE_3_OFFSET9:%.*]] = add i64 [[SHADOW_PTR_INT3]], 24
+; CHECK-NEXT: [[SHADOW_BYTE_3_PTR10:%.*]] = inttoptr i64 [[SHADOW_BYTE_3_OFFSET9]] to ptr
+; CHECK-NEXT: store ptr inttoptr (i64 -3 to ptr), ptr [[SHADOW_BYTE_3_PTR10]], align 8
+; CHECK-NEXT: ret void
+
+
+
+!llvm.tysan.globals = !{!13, !14}
+
+!0 = !{!"Simple C++ TBAA"}
+!1 = !{!"omnipotent char", !0, i64 0}
+!2 = !{!"int", !1, i64 0}
+!13 = !{ptr @global1, !2}
+!14 = !{ptr @global2, !2}
diff --git a/llvm/test/Instrumentation/TypeSanitizer/invalid-metadata.ll b/llvm/test/Instrumentation/TypeSanitizer/invalid-metadata.ll
new file mode 100644
index 00000000000000..4527aa5cf2a015
--- /dev/null
+++ b/llvm/test/Instrumentation/TypeSanitizer/invalid-metadata.ll
@@ -0,0 +1,25 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals --include-generated-funcs
+; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+
+!llvm.tysan.globals = !{!0}
+
+!0 = distinct !{ptr undef, !1}
+!1 = !{!"any pointer", !2, i64 0}
+!2 = !{!"omnipotent char", !3, i64 0}
+!3 = !{!"Simple C/C++ TBAA"}
+;.
+; CHECK: @[[LLVM_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [1 x ptr] [ptr @tysan.module_ctor], section "llvm.metadata"
+; CHECK: @[[LLVM_GLOBAL_CTORS:[a-zA-Z0-9_$"\\.-]+]] = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr @tysan.module_ctor, ptr null }]
+;.
+; CHECK-LABEL: @tysan.module_ctor(
+; CHECK-NEXT: call void @__tysan_init()
+; CHECK-NEXT: ret void
+;
+;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { nounwind }
+;.
+; CHECK: [[META0:![0-9]+]] = distinct !{ptr undef, !1}
+; CHECK: [[META1:![0-9]+]] = !{!"any pointer", !2, i64 0}
+; CHECK: [[META2:![0-9]+]] = !{!"omnipotent char", !3, i64 0}
+; CHECK: [[META3:![0-9]+]] = !{!"Simple C/C++ TBAA"}
+;.
diff --git a/llvm/test/Instrumentation/TypeSanitizer/memintrinsics.ll b/llvm/test/Instrumentation/TypeSanitizer/memintrinsics.ll
new file mode 100644
index 00000000000000..26f7c186748cb6
--- /dev/null
+++ b/llvm/test/Instrumentation/TypeSanitizer/memintrinsics.ll
@@ -0,0 +1,77 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; Test basic type sanitizer instrumentation.
+;
+; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare void @llvm.memset.p0.i64(ptr nocapture, i8, i64, i32, i1) nounwind
+declare void @llvm.memmove.p0.p0.i64(ptr nocapture, ptr nocapture readonly, i64, i32, i1) nounwind
+declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture readonly, i64, i32, i1) nounwind
+
+define void @test_memset(ptr %a, ptr %b) nounwind uwtable sanitize_type {
+; CHECK-LABEL: @test_memset(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[APP_MEM_MASK:%.*]] = load i64, ptr @__tysan_app_memory_mask, align 8
+; CHECK-NEXT: [[SHADOW_BASE:%.*]] = load i64, ptr @__tysan_shadow_memory_address, align 8
+; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[A:%.*]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], [[APP_MEM_MASK]]
+; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[TMP1]], 3
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP2]], [[SHADOW_BASE]]
+; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[TMP4]], i8 0, i64 800, i1 false)
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[A]], i8 0, i64 100, i1 false)
+; CHECK-NEXT: ret void
+;
+ entry:
+ tail call void @llvm.memset.p0.i64(ptr %a, i8 0, i64 100, i32 1, i1 false)
+ ret void
+}
+
+define void @test_memmove(ptr %a, ptr %b) nounwind uwtable sanitize_type {
+; CHECK-LABEL: @test_memmove(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[APP_MEM_MASK:%.*]] = load i64, ptr @__tysan_app_memory_mask, align 8
+; CHECK-NEXT: [[SHADOW_BASE:%.*]] = load i64, ptr @__tysan_shadow_memory_address, align 8
+; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[A:%.*]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], [[APP_MEM_MASK]]
+; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[TMP1]], 3
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP2]], [[SHADOW_BASE]]
+; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[B:%.*]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = and i64 [[TMP5]], [[APP_MEM_MASK]]
+; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 3
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], [[SHADOW_BASE]]
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 8 [[TMP4]], ptr align 8 [[TMP9]], i64 800, i1 false)
+; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 1 [[A]], ptr align 1 [[B]], i64 100, i1 false)
+; CHECK-NEXT: ret void
+;
+ entry:
+ tail call void @llvm.memmove.p0.p0.i64(ptr %a, ptr %b, i64 100, i32 1, i1 false)
+ ret void
+}
+
+define void @test_memcpy(ptr %a, ptr %b) nounwind uwtable sanitize_type {
+; CHECK-LABEL: @test_memcpy(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[APP_MEM_MASK:%.*]] = load i64, ptr @__tysan_app_memory_mask, align 8
+; CHECK-NEXT: [[SHADOW_BASE:%.*]] = load i64, ptr @__tysan_shadow_memory_address, align 8
+; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[A:%.*]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], [[APP_MEM_MASK]]
+; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[TMP1]], 3
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP2]], [[SHADOW_BASE]]
+; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[B:%.*]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = and i64 [[TMP5]], [[APP_MEM_MASK]]
+; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 3
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], [[SHADOW_BASE]]
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP4]], ptr align 8 [[TMP9]], i64 800, i1 false)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[A]], ptr align 1 [[B]], i64 100, i1 false)
+; CHECK-NEXT: ret void
+;
+ entry:
+ tail call void @llvm.memcpy.p0.p0.i64(ptr %a, ptr %b, i64 100, i32 1, i1 false)
+ ret void
+}
diff --git a/llvm/test/Instrumentation/TypeSanitizer/nosanitize.ll b/llvm/test/Instrumentation/TypeSanitizer/nosanitize.ll
new file mode 100644
index 00000000000000..67e408439ec165
--- /dev/null
+++ b/llvm/test/Instrumentation/TypeSanitizer/nosanitize.ll
@@ -0,0 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
+; Test basic type sanitizer instrumentation.
+;
+; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+;.
+; CHECK: @[[LLVM_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [1 x ptr] [ptr @tysan.module_ctor], section "llvm.metadata"
+; CHECK: @[[LLVM_GLOBAL_CTORS:[a-zA-Z0-9_$"\\.-]+]] = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 0, ptr @tysan.module_ctor, ptr null }]
+;.
+define i32 @test_load(ptr %a) sanitize_type {
+; CHECK-LABEL: @test_load(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[A:%.*]], align 4, !tbaa [[TBAA0:![0-9]+]], !nosanitize !4
+; CHECK-NEXT: ret i32 [[TMP1]]
+;
+entry:
+ %tmp1 = load i32, ptr %a, align 4, !tbaa !3, !nosanitize !{}
+ ret i32 %tmp1
+}
+
+!0 = !{!"Simple C++ TBAA"}
+!1 = !{!"omnipotent char", !0, i64 0}
+!2 = !{!"int", !1, i64 0}
+!3 = !{!2, !2, i64 0}
+!4 = !{!"_ZTS1x", !2, i64 0, !2, i64 4}
+!5 = !{!"_ZTS1v", !2, i64 8, !2, i64 12, !4, i64 16}
+!6 = !{!5, !2, i64 12}
+;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { sanitize_type }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { nounwind }
+;.
+; CHECK: [[TBAA0]] = !{!1, !1, i64 0}
+; CHECK: [[META1:![0-9]+]] = !{!"int", !2, i64 0}
+; CHECK: [[META2:![0-9]+]] = !{!"omnipotent char", !3, i64 0}
+; CHECK: [[META3:![0-9]+]] = !{!"Simple C++ TBAA"}
+; CHECK: [[META4:![0-9]+]] = !{}
+;.
diff --git a/llvm/test/Instrumentation/TypeSanitizer/sanitize-no-tbaa.ll b/llvm/test/Instrumentation/TypeSanitizer/sanitize-no-tbaa.ll
new file mode 100644
index 00000000000000..3cb7b8365866b0
--- /dev/null
+++ b/llvm/test/Instrumentation/TypeSanitizer/sanitize-no-tbaa.ll
@@ -0,0 +1,180 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; Test basic type sanitizer instrumentation.
+;
+; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define i32 @test_load_unk(ptr %a) sanitize_type {
+; CHECK-LABEL: @test_load_unk(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[APP_MEM_MASK:%.*]] = load i64, ptr @__tysan_app_memory_mask, align 8
+; CHECK-NEXT: [[SHADOW_BASE:%.*]] = load i64, ptr @__tysan_shadow_memory_address, align 8
+; CHECK-NEXT: [[APP_PTR_INT:%.*]] = ptrtoint ptr [[A:%.*]] to i64
+; CHECK-NEXT: [[APP_PTR_MASKED:%.*]] = and i64 [[APP_PTR_INT]], [[APP_MEM_MASK]]
+; CHECK-NEXT: [[APP_PTR_SHIFTED:%.*]] = shl i64 [[APP_PTR_MASKED]], 3
+; CHECK-NEXT: [[SHADOW_PTR_INT:%.*]] = add i64 [[APP_PTR_SHIFTED]], [[SHADOW_BASE]]
+; CHECK-NEXT: [[SHADOW_PTR:%.*]] = inttoptr i64 [[SHADOW_PTR_INT]] to ptr
+; CHECK-NEXT: [[SHADOW_DESC:%.*]] = load ptr, ptr [[SHADOW_PTR]], align 8
+; CHECK-NEXT: [[BAD_DESC:%.*]] = icmp ne ptr [[SHADOW_DESC]], null
+; CHECK-NEXT: br i1 [[BAD_DESC]], label [[TMP0:%.*]], label [[TMP22:%.*]], !prof [[PROF0:![0-9]+]]
+; CHECK: 0:
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq ptr [[SHADOW_DESC]], null
+; CHECK-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP20:%.*]]
+; CHECK: 2:
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[SHADOW_PTR_INT]], 8
+; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ne ptr [[TMP5]], null
+; CHECK-NEXT: [[TMP7:%.*]] = or i1 false, [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[SHADOW_PTR_INT]], 16
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8
+; CHECK-NEXT: [[TMP11:%.*]] = icmp ne ptr [[TMP10]], null
+; CHECK-NEXT: [[TMP12:%.*]] = or i1 [[TMP7]], [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[SHADOW_PTR_INT]], 24
+; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
+; CHECK-NEXT: [[TMP15:%.*]] = load ptr, ptr [[TMP14]], align 8
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne ptr [[TMP15]], null
+; CHECK-NEXT: [[TMP17:%.*]] = or i1 [[TMP12]], [[TMP16]]
+; CHECK-NEXT: br i1 [[TMP17]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF0]]
+; CHECK: 18:
+; CHECK-NEXT: call void @__tysan_check(ptr [[A]], i32 4, ptr null, i32 1)
+; CHECK-NEXT: br label [[TMP19]]
+; CHECK: 19:
+; CHECK-NEXT: store ptr null, ptr [[SHADOW_PTR]], align 8
+; CHECK-NEXT: [[SHADOW_BYTE_1_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 8
+; CHECK-NEXT: [[SHADOW_BYTE_1_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_1_OFFSET]] to ptr
+; CHECK-NEXT: store ptr inttoptr (i64 -1 to ptr), ptr [[SHADOW_BYTE_1_PTR]], align 8
+; CHECK-NEXT: [[SHADOW_BYTE_2_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 16
+; CHECK-NEXT: [[SHADOW_BYTE_2_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_2_OFFSET]] to ptr
+; CHECK-NEXT: store ptr inttoptr (i64 -2 to ptr), ptr [[SHADOW_BYTE_2_PTR]], align 8
+; CHECK-NEXT: [[SHADOW_BYTE_3_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 24
+; CHECK-NEXT: [[SHADOW_BYTE_3_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_3_OFFSET]] to ptr
+; CHECK-NEXT: store ptr inttoptr (i64 -3 to ptr), ptr [[SHADOW_BYTE_3_PTR]], align 8
+; CHECK-NEXT: br label [[TMP21:%.*]]
+; CHECK: 20:
+; CHECK-NEXT: call void @__tysan_check(ptr [[A]], i32 4, ptr null, i32 1)
+; CHECK-NEXT: br label [[TMP21]]
+; CHECK: 21:
+; CHECK-NEXT: br label [[TMP43:%.*]]
+; CHECK: 22:
+; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[SHADOW_PTR_INT]], 8
+; CHECK-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+; CHECK-NEXT: [[TMP25:%.*]] = load ptr, ptr [[TMP24]], align 8
+; CHECK-NEXT: [[TMP26:%.*]] = ptrtoint ptr [[TMP25]] to i64
+; CHECK-NEXT: [[TMP27:%.*]] = icmp sge i64 [[TMP26]], 0
+; CHECK-NEXT: [[TMP28:%.*]] = or i1 false, [[TMP27]]
+; CHECK-NEXT: [[TMP29:%.*]] = add i64 [[SHADOW_PTR_INT]], 16
+; CHECK-NEXT: [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+; CHECK-NEXT: [[TMP31:%.*]] = load ptr, ptr [[TMP30]], align 8
+; CHECK-NEXT: [[TMP32:%.*]] = ptrtoint ptr [[TMP31]] to i64
+; CHECK-NEXT: [[TMP33:%.*]] = icmp sge i64 [[TMP32]], 0
+; CHECK-NEXT: [[TMP34:%.*]] = or i1 [[TMP28]], [[TMP33]]
+; CHECK-NEXT: [[TMP35:%.*]] = add i64 [[SHADOW_PTR_INT]], 24
+; CHECK-NEXT: [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+; CHECK-NEXT: [[TMP37:%.*]] = load ptr, ptr [[TMP36]], align 8
+; CHECK-NEXT: [[TMP38:%.*]] = ptrtoint ptr [[TMP37]] to i64
+; CHECK-NEXT: [[TMP39:%.*]] = icmp sge i64 [[TMP38]], 0
+; CHECK-NEXT: [[TMP40:%.*]] = or i1 [[TMP34]], [[TMP39]]
+; CHECK-NEXT: br i1 [[TMP40]], label [[TMP41:%.*]], label [[TMP42:%.*]], !prof [[PROF0]]
+; CHECK: 41:
+; CHECK-NEXT: call void @__tysan_check(ptr [[A]], i32 4, ptr null, i32 1)
+; CHECK-NEXT: br label [[TMP42]]
+; CHECK: 42:
+; CHECK-NEXT: br label [[TMP43]]
+; CHECK: 43:
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[A]], align 4
+; CHECK-NEXT: ret i32 [[TMP1]]
+;
+entry:
+ %tmp1 = load i32, ptr %a, align 4
+ ret i32 %tmp1
+}
+
+define void @test_store_unk(ptr %a) sanitize_type {
+; CHECK-LABEL: @test_store_unk(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[APP_MEM_MASK:%.*]] = load i64, ptr @__tysan_app_memory_mask, align 8
+; CHECK-NEXT: [[SHADOW_BASE:%.*]] = load i64, ptr @__tysan_shadow_memory_address, align 8
+; CHECK-NEXT: [[APP_PTR_INT:%.*]] = ptrtoint ptr [[A:%.*]] to i64
+; CHECK-NEXT: [[APP_PTR_MASKED:%.*]] = and i64 [[APP_PTR_INT]], [[APP_MEM_MASK]]
+; CHECK-NEXT: [[APP_PTR_SHIFTED:%.*]] = shl i64 [[APP_PTR_MASKED]], 3
+; CHECK-NEXT: [[SHADOW_PTR_INT:%.*]] = add i64 [[APP_PTR_SHIFTED]], [[SHADOW_BASE]]
+; CHECK-NEXT: [[SHADOW_PTR:%.*]] = inttoptr i64 [[SHADOW_PTR_INT]] to ptr
+; CHECK-NEXT: [[SHADOW_DESC:%.*]] = load ptr, ptr [[SHADOW_PTR]], align 8
+; CHECK-NEXT: [[BAD_DESC:%.*]] = icmp ne ptr [[SHADOW_DESC]], null
+; CHECK-NEXT: br i1 [[BAD_DESC]], label [[TMP0:%.*]], label [[TMP22:%.*]], !prof [[PROF0]]
+; CHECK: 0:
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq ptr [[SHADOW_DESC]], null
+; CHECK-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP20:%.*]]
+; CHECK: 2:
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[SHADOW_PTR_INT]], 8
+; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
+; CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ne ptr [[TMP5]], null
+; CHECK-NEXT: [[TMP7:%.*]] = or i1 false, [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[SHADOW_PTR_INT]], 16
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8
+; CHECK-NEXT: [[TMP11:%.*]] = icmp ne ptr [[TMP10]], null
+; CHECK-NEXT: [[TMP12:%.*]] = or i1 [[TMP7]], [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[SHADOW_PTR_INT]], 24
+; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
+; CHECK-NEXT: [[TMP15:%.*]] = load ptr, ptr [[TMP14]], align 8
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne ptr [[TMP15]], null
+; CHECK-NEXT: [[TMP17:%.*]] = or i1 [[TMP12]], [[TMP16]]
+; CHECK-NEXT: br i1 [[TMP17]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF0]]
+; CHECK: 18:
+; CHECK-NEXT: call void @__tysan_check(ptr [[A]], i32 4, ptr null, i32 2)
+; CHECK-NEXT: br label [[TMP19]]
+; CHECK: 19:
+; CHECK-NEXT: store ptr null, ptr [[SHADOW_PTR]], align 8
+; CHECK-NEXT: [[SHADOW_BYTE_1_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 8
+; CHECK-NEXT: [[SHADOW_BYTE_1_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_1_OFFSET]] to ptr
+; CHECK-NEXT: store ptr inttoptr (i64 -1 to ptr), ptr [[SHADOW_BYTE_1_PTR]], align 8
+; CHECK-NEXT: [[SHADOW_BYTE_2_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 16
+; CHECK-NEXT: [[SHADOW_BYTE_2_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_2_OFFSET]] to ptr
+; CHECK-NEXT: store ptr inttoptr (i64 -2 to ptr), ptr [[SHADOW_BYTE_2_PTR]], align 8
+; CHECK-NEXT: [[SHADOW_BYTE_3_OFFSET:%.*]] = add i64 [[SHADOW_PTR_INT]], 24
+; CHECK-NEXT: [[SHADOW_BYTE_3_PTR:%.*]] = inttoptr i64 [[SHADOW_BYTE_3_OFFSET]] to ptr
+; CHECK-NEXT: store ptr inttoptr (i64 -3 to ptr), ptr [[SHADOW_BYTE_3_PTR]], align 8
+; CHECK-NEXT: br label [[TMP21:%.*]]
+; CHECK: 20:
+; CHECK-NEXT: call void @__tysan_check(ptr [[A]], i32 4, ptr null, i32 2)
+; CHECK-NEXT: br label [[TMP21]]
+; CHECK: 21:
+; CHECK-NEXT: br label [[TMP43:%.*]]
+; CHECK: 22:
+; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[SHADOW_PTR_INT]], 8
+; CHECK-NEXT: [[TMP24:%.*]] = inttoptr i64 [[TMP23]] to ptr
+; CHECK-NEXT: [[TMP25:%.*]] = load ptr, ptr [[TMP24]], align 8
+; CHECK-NEXT: [[TMP26:%.*]] = ptrtoint ptr [[TMP25]] to i64
+; CHECK-NEXT: [[TMP27:%.*]] = icmp sge i64 [[TMP26]], 0
+; CHECK-NEXT: [[TMP28:%.*]] = or i1 false, [[TMP27]]
+; CHECK-NEXT: [[TMP29:%.*]] = add i64 [[SHADOW_PTR_INT]], 16
+; CHECK-NEXT: [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr
+; CHECK-NEXT: [[TMP31:%.*]] = load ptr, ptr [[TMP30]], align 8
+; CHECK-NEXT: [[TMP32:%.*]] = ptrtoint ptr [[TMP31]] to i64
+; CHECK-NEXT: [[TMP33:%.*]] = icmp sge i64 [[TMP32]], 0
+; CHECK-NEXT: [[TMP34:%.*]] = or i1 [[TMP28]], [[TMP33]]
+; CHECK-NEXT: [[TMP35:%.*]] = add i64 [[SHADOW_PTR_INT]], 24
+; CHECK-NEXT: [[TMP36:%.*]] = inttoptr i64 [[TMP35]] to ptr
+; CHECK-NEXT: [[TMP37:%.*]] = load ptr, ptr [[TMP36]], align 8
+; CHECK-NEXT: [[TMP38:%.*]] = ptrtoint ptr [[TMP37]] to i64
+; CHECK-NEXT: [[TMP39:%.*]] = icmp sge i64 [[TMP38]], 0
+; CHECK-NEXT: [[TMP40:%.*]] = or i1 [[TMP34]], [[TMP39]]
+; CHECK-NEXT: br i1 [[TMP40]], label [[TMP41:%.*]], label [[TMP42:%.*]], !prof [[PROF0]]
+; CHECK: 41:
+; CHECK-NEXT: call void @__tysan_check(ptr [[A]], i32 4, ptr null, i32 2)
+; CHECK-NEXT: br label [[TMP42]]
+; CHECK: 42:
+; CHECK-NEXT: br label [[TMP43]]
+; CHECK: 43:
+; CHECK-NEXT: store i32 42, ptr [[A]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ store i32 42, ptr %a, align 4
+ ret void
+}
diff --git a/llvm/test/Instrumentation/TypeSanitizer/swifterror.ll b/llvm/test/Instrumentation/TypeSanitizer/swifterror.ll
new file mode 100644
index 00000000000000..5711fb4b839f4d
--- /dev/null
+++ b/llvm/test/Instrumentation/TypeSanitizer/swifterror.ll
@@ -0,0 +1,24 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; Test basic type sanitizer instrumentation.
+;
+; RUN: opt -passes='tysan-module,tysan' -S %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @test_swifterror(ptr swifterror) sanitize_type {
+; CHECK-LABEL: @test_swifterror(
+; CHECK-NEXT: [[SWIFTERROR_PTR_VALUE:%.*]] = load ptr, ptr [[TMP0:%.*]], align 8
+; CHECK-NEXT: ret void
+;
+ %swifterror_ptr_value = load ptr, ptr %0
+ ret void
+}
+
+define void @test_swifterror_2(ptr swifterror) sanitize_type {
+; CHECK-LABEL: @test_swifterror_2(
+; CHECK-NEXT: store ptr null, ptr [[TMP0:%.*]], align 8
+; CHECK-NEXT: ret void
+;
+ store ptr null, ptr %0
+ ret void
+}
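
The instrumented tests above repeatedly check the same address arithmetic: the shadow descriptor slot for an application pointer is found by masking the pointer with __tysan_app_memory_mask, scaling by the pointer size, and adding __tysan_shadow_memory_address. Below is a minimal C++ sketch of that mapping; the helper name is illustrative only, since the pass emits the equivalent IR directly, as the app.ptr.masked / app.ptr.shifted / shadow.ptr.int sequences above show.

#include <cstdint>

// Provided by the TySan runtime; the instrumentation loads these from the
// globals referenced in the CHECK lines above.
extern uint64_t __tysan_app_memory_mask;
extern uint64_t __tysan_shadow_memory_address;

// One pointer-sized shadow slot per byte of application memory, hence the
// shift by 3 (PtrShift) on 64-bit targets.
static inline void **tysanShadowSlotFor(const void *AppPtr) {
  uint64_t App = reinterpret_cast<uintptr_t>(AppPtr);
  uint64_t Masked = App & __tysan_app_memory_mask;              // app.ptr.masked
  uint64_t Shifted = Masked << 3;                               // app.ptr.shifted
  uint64_t ShadowInt = Shifted + __tysan_shadow_memory_address; // shadow.ptr.int
  return reinterpret_cast<void **>(ShadowInt);                  // shadow.ptr
}
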
>From 78b4a820d39de8899039f14046b88a26895e2025 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 18 Apr 2024 22:52:16 +0100
Subject: [PATCH 13/18] !fixup address comments, thanks
---
.../llvm/Analysis/TypeBasedAliasAnalysis.h | 8 ++++++
llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp | 26 ++++---------------
.../Instrumentation/TypeSanitizer.cpp | 22 +++++++---------
.../Analysis/AliasSetTrackerTest.cpp | 2 +-
4 files changed, 24 insertions(+), 34 deletions(-)
diff --git a/llvm/include/llvm/Analysis/TypeBasedAliasAnalysis.h b/llvm/include/llvm/Analysis/TypeBasedAliasAnalysis.h
index 36dd39c033aa63..0ec84a461a3db7 100644
--- a/llvm/include/llvm/Analysis/TypeBasedAliasAnalysis.h
+++ b/llvm/include/llvm/Analysis/TypeBasedAliasAnalysis.h
@@ -29,7 +29,15 @@ class MemoryLocation;
/// A simple AA result that uses TBAA metadata to answer queries.
class TypeBasedAAResult : public AAResultBase {
+ /// True if type sanitizer is enabled. When TypeSanitizer is used, don't use
+ /// TBAA information for alias analysis as this might cause us to remove
+ /// memory accesses that we need to verify at runtime.
+ bool UsingTypeSanitizer;
+
public:
+ TypeBasedAAResult(bool UsingTypeSanitizer)
+ : UsingTypeSanitizer(UsingTypeSanitizer) {}
+
/// Handle invalidation events from the new pass manager.
///
/// By definition, this result is stateless and so remains valid.
diff --git a/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp b/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp
index a499e16ff0097f..08c7736af477e9 100644
--- a/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp
+++ b/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp
@@ -372,26 +372,10 @@ static bool isStructPathTBAA(const MDNode *MD) {
return isa<MDNode>(MD->getOperand(0)) && MD->getNumOperands() >= 3;
}
-// When using the TypeSanitizer, don't use TBAA information for alias analysis.
-// This might cause us to remove memory accesses that we need to verify at
-// runtime.
-static bool usingSanitizeType(const Value *V) {
- const Function *F;
-
- if (auto *I = dyn_cast<Instruction>(V))
- F = I->getParent()->getParent();
- else if (auto *A = dyn_cast<Argument>(V))
- F = A->getParent();
- else
- return false;
-
- return F->hasFnAttribute(Attribute::SanitizeType);
-}
-
AliasResult TypeBasedAAResult::alias(const MemoryLocation &LocA,
const MemoryLocation &LocB,
AAQueryInfo &AAQI, const Instruction *) {
- if (!EnableTBAA || usingSanitizeType(LocA.Ptr) || usingSanitizeType(LocB.Ptr))
+ if (!EnableTBAA || UsingTypeSanitizer)
return AAResultBase::alias(LocA, LocB, AAQI, nullptr);
if (Aliases(LocA.AATags.TBAA, LocB.AATags.TBAA))
@@ -442,7 +426,7 @@ MemoryEffects TypeBasedAAResult::getMemoryEffects(const Function *F) {
ModRefInfo TypeBasedAAResult::getModRefInfo(const CallBase *Call,
const MemoryLocation &Loc,
AAQueryInfo &AAQI) {
- if (!EnableTBAA || usingSanitizeType(Call))
+ if (!EnableTBAA || UsingTypeSanitizer)
return AAResultBase::getModRefInfo(Call, Loc, AAQI);
if (const MDNode *L = Loc.AATags.TBAA)
@@ -456,7 +440,7 @@ ModRefInfo TypeBasedAAResult::getModRefInfo(const CallBase *Call,
ModRefInfo TypeBasedAAResult::getModRefInfo(const CallBase *Call1,
const CallBase *Call2,
AAQueryInfo &AAQI) {
- if (!EnableTBAA || usingSanitizeType(Call1))
+ if (!EnableTBAA || UsingTypeSanitizer)
return AAResultBase::getModRefInfo(Call1, Call2, AAQI);
if (const MDNode *M1 = Call1->getMetadata(LLVMContext::MD_tbaa))
@@ -724,7 +708,7 @@ bool TypeBasedAAResult::Aliases(const MDNode *A, const MDNode *B) const {
AnalysisKey TypeBasedAA::Key;
TypeBasedAAResult TypeBasedAA::run(Function &F, FunctionAnalysisManager &AM) {
- return TypeBasedAAResult();
+ return TypeBasedAAResult(F.hasFnAttribute(Attribute::SanitizeType));
}
char TypeBasedAAWrapperPass::ID = 0;
@@ -740,7 +724,7 @@ TypeBasedAAWrapperPass::TypeBasedAAWrapperPass() : ImmutablePass(ID) {
}
bool TypeBasedAAWrapperPass::doInitialization(Module &M) {
- Result.reset(new TypeBasedAAResult());
+ Result.reset(new TypeBasedAAResult(false));
return false;
}
diff --git a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp
index ed4aba4ad612d9..2ca9b8a8d8ce41 100644
--- a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp
@@ -110,6 +110,7 @@ struct TypeSanitizer {
TypeNameMapTy &TypeNames, Module &M);
const Triple TargetTriple;
+ Regex AnonNameRegex;
Type *IntptrTy;
uint64_t PtrShift;
IntegerType *OrdTy;
@@ -122,7 +123,8 @@ struct TypeSanitizer {
} // namespace
TypeSanitizer::TypeSanitizer(Module &M)
- : TargetTriple(Triple(M.getTargetTriple())) {
+ : TargetTriple(Triple(M.getTargetTriple())),
+ AnonNameRegex("^_ZTS.*N[1-9][0-9]*_GLOBAL__N") {
const DataLayout &DL = M.getDataLayout();
IntptrTy = DL.getIntPtrType(M.getContext());
PtrShift = countr_zero(IntptrTy->getPrimitiveSizeInBits() / 8);
@@ -237,16 +239,6 @@ static std::string encodeName(StringRef Name) {
return Output;
}
-static bool isAnonymousNamespaceName(StringRef Name) {
- // Types that are in an anonymous namespace are local to this module.
- // FIXME: This should really be marked by the frontend in the metadata
- // instead of having us guess this from the mangled name. Moreover, the regex
- // here can pick up (unlikely) names in the non-reserved namespace (because
- // it needs to search into the type to pick up cases where the type in the
- // anonymous namespace is a template parameter, etc.).
- return AnonNameRegex.match(Name);
-}
-
std::string
TypeSanitizer::getAnonymousStructIdentifier(const MDNode *MD,
TypeNameMapTy &TypeNames) {
@@ -352,7 +344,13 @@ bool TypeSanitizer::generateBaseTypeDescriptor(
TDSubTys.push_back(IntptrTy);
TDSubData.push_back(ConstantInt::get(IntptrTy, Members.size()));
- bool ShouldBeComdat = !isAnonymousNamespaceName(NameNode->getString());
+ // Types that are in an anonymous namespace are local to this module.
+ // FIXME: This should really be marked by the frontend in the metadata
+ // instead of having us guess this from the mangled name. Moreover, the regex
+ // here can pick up (unlikely) names in the non-reserved namespace (because
+ // it needs to search into the type to pick up cases where the type in the
+ // anonymous namespace is a template parameter, etc.).
+ bool ShouldBeComdat = !AnonNameRegex.match(NameNode->getString());
for (auto &Member : Members) {
TDSubTys.push_back(Member.first->getType());
TDSubData.push_back(Member.first);
diff --git a/llvm/unittests/Analysis/AliasSetTrackerTest.cpp b/llvm/unittests/Analysis/AliasSetTrackerTest.cpp
index 68bd41a1e8589b..e784e6eefb79c6 100644
--- a/llvm/unittests/Analysis/AliasSetTrackerTest.cpp
+++ b/llvm/unittests/Analysis/AliasSetTrackerTest.cpp
@@ -62,7 +62,7 @@ TEST(AliasSetTracker, AliasUnknownInst) {
TargetLibraryInfoImpl TLII(Trip);
TargetLibraryInfo TLI(TLII);
AAResults AA(TLI);
- TypeBasedAAResult TBAAR;
+ TypeBasedAAResult TBAAR(false);
AA.addAAResult(TBAAR);
// Initialize the alias set tracker for the @test function.
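
With this change, the decision to bypass TBAA is captured once per function when the analysis result is constructed, rather than being re-derived from each queried value. A minimal sketch of building the result by hand the same way TypeBasedAA::run now does; the free function is illustrative and not part of the patch:

#include "llvm/Analysis/TypeBasedAliasAnalysis.h"
#include "llvm/IR/Function.h"

// Passing true makes every TBAA query fall back to the conservative
// AAResultBase answer, which is the intent when TySan instruments F: TBAA
// no-alias results must not remove accesses the runtime still has to verify.
static llvm::TypeBasedAAResult makeTBAAResultFor(const llvm::Function &F) {
  bool UsingTySan = F.hasFnAttribute(llvm::Attribute::SanitizeType);
  return llvm::TypeBasedAAResult(UsingTySan);
}
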
>From a5312aad78f7ef8abce272f1e5a10e4667176f0f Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 27 Jun 2024 15:08:19 +0100
Subject: [PATCH 14/18] !fixup address comments, thanks!
---
llvm/include/llvm/IR/Attributes.td | 2 +-
.../Instrumentation/TypeSanitizer.cpp | 24 +++++++++----------
...-with-offfset.ll => access-with-offset.ll} | 0
3 files changed, 12 insertions(+), 14 deletions(-)
rename llvm/test/Instrumentation/TypeSanitizer/{access-with-offfset.ll => access-with-offset.ll} (100%)
diff --git a/llvm/include/llvm/IR/Attributes.td b/llvm/include/llvm/IR/Attributes.td
index 179238bc733830..61955cf883c3f1 100644
--- a/llvm/include/llvm/IR/Attributes.td
+++ b/llvm/include/llvm/IR/Attributes.td
@@ -318,7 +318,7 @@ def SanitizeAddress : EnumAttr<"sanitize_address", IntersectPreserve, [FnAttr]>;
def SanitizeThread : EnumAttr<"sanitize_thread", IntersectPreserve, [FnAttr]>;
/// TypeSanitizer is on.
-def SanitizeType : EnumAttr<"sanitize_type", [FnAttr]>;
+def SanitizeType : EnumAttr<"sanitize_type", IntersectPreserve, [FnAttr]>;
/// MemorySanitizer is on.
def SanitizeMemory : EnumAttr<"sanitize_memory", IntersectPreserve, [FnAttr]>;
diff --git a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp
index 2ca9b8a8d8ce41..6f5f7108ada0dd 100644
--- a/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/TypeSanitizer.cpp
@@ -143,9 +143,9 @@ void TypeSanitizer::initializeCallbacks(Module &M) {
TysanCheck = cast<Function>(
M.getOrInsertFunction(kTysanCheckName, Attr, IRB.getVoidTy(),
IRB.getPtrTy(), // Pointer to data to be read.
- OrdTy, // Size of the data in bytes.
+ OrdTy, // Size of the data in bytes.
IRB.getPtrTy(), // Pointer to type descriptor.
- OrdTy // Flags.
+ OrdTy // Flags.
)
.getCallee());
@@ -601,7 +601,7 @@ bool TypeSanitizer::instrumentWithShadowUpdate(
Value *ShadowDataInt = ConvertToShadowDataInt(IRB, Ptr, IntptrTy, PtrShift,
ShadowBase, AppMemMask);
- Type *Int8PtrPtrTy = IRB.getPtrTy()->getPointerTo();
+ Type *Int8PtrPtrTy = PointerType::get(IRB.getPtrTy(), 0);
Value *ShadowData =
IRB.CreateIntToPtr(ShadowDataInt, Int8PtrPtrTy, "shadow.ptr");
@@ -637,8 +637,7 @@ bool TypeSanitizer::instrumentWithShadowUpdate(
Constant *Flags =
ConstantInt::get(OrdTy, (int)IsRead | (((int)IsWrite) << 1));
- Value *LoadedTD =
- IRB.CreateLoad(IRB.getPtrTy(), ShadowData, "shadow.desc");
+ Value *LoadedTD = IRB.CreateLoad(IRB.getPtrTy(), ShadowData, "shadow.desc");
if (SanitizeFunction) {
Value *BadTDCmp = IRB.CreateICmpNE(LoadedTD, TD, "bad.desc");
Instruction *BadTDTerm, *GoodTDTerm;
@@ -673,16 +672,16 @@ bool TypeSanitizer::instrumentWithShadowUpdate(
Instruction *BadUTDTerm = SplitBlockAndInsertIfThen(
NotAllUnkTD, BeforeSetType, false, UnlikelyBW);
IRB.SetInsertPoint(BadUTDTerm);
- IRB.CreateCall(TysanCheck, {IRB.CreateBitCast(Ptr, IRB.getPtrTy()),
- Size, (Value *)TD, (Value *)Flags});
+ IRB.CreateCall(TysanCheck, {IRB.CreateBitCast(Ptr, IRB.getPtrTy()), Size,
+ (Value *)TD, (Value *)Flags});
IRB.SetInsertPoint(BeforeSetType);
SetType();
// We have a non-trivial mismatch. Call the runtime.
IRB.SetInsertPoint(MismatchTerm);
- IRB.CreateCall(TysanCheck, {IRB.CreateBitCast(Ptr, IRB.getPtrTy()),
- Size, (Value *)TD, (Value *)Flags});
+ IRB.CreateCall(TysanCheck, {IRB.CreateBitCast(Ptr, IRB.getPtrTy()), Size,
+ (Value *)TD, (Value *)Flags});
// We appear to have the right type. Make sure that all other bytes in
// the type are still marked as interior bytes. If not, call the runtime.
@@ -703,8 +702,8 @@ bool TypeSanitizer::instrumentWithShadowUpdate(
Instruction *BadITDTerm = SplitBlockAndInsertIfThen(
NotAllBadTD, &*IRB.GetInsertPoint(), false, UnlikelyBW);
IRB.SetInsertPoint(BadITDTerm);
- IRB.CreateCall(TysanCheck, {IRB.CreateBitCast(Ptr, IRB.getPtrTy()),
- Size, (Value *)TD, (Value *)Flags});
+ IRB.CreateCall(TysanCheck, {IRB.CreateBitCast(Ptr, IRB.getPtrTy()), Size,
+ (Value *)TD, (Value *)Flags});
} else {
// If we're not sanitizing this function, then we only care whether we
// need to *set* the type.
@@ -843,8 +842,7 @@ bool TypeSanitizer::instrumentMemInst(Value *V, Value *&ShadowBase,
IRB.CreateAnd(IRB.CreatePtrToInt(Src, IntptrTy), AppMemMask),
PtrShift),
ShadowBase);
- Value *SrcShadowData =
- IRB.CreateIntToPtr(SrcShadowDataInt, IRB.getPtrTy());
+ Value *SrcShadowData = IRB.CreateIntToPtr(SrcShadowDataInt, IRB.getPtrTy());
if (NeedsMemMove) {
IRB.CreateMemMove(ShadowData, Align(1u << PtrShift), SrcShadowData,
diff --git a/llvm/test/Instrumentation/TypeSanitizer/access-with-offfset.ll b/llvm/test/Instrumentation/TypeSanitizer/access-with-offset.ll
similarity index 100%
rename from llvm/test/Instrumentation/TypeSanitizer/access-with-offfset.ll
rename to llvm/test/Instrumentation/TypeSanitizer/access-with-offset.ll
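
The comdat decision in TypeSanitizer::generateBaseTypeDescriptor (shown in the previous patch) relies on a purely name-based heuristic, as the inlined FIXME notes. A small standalone sketch of which descriptor names the regex treats as anonymous-namespace-local; the mangled names below are illustrative examples, not taken from the tests:

#include "llvm/Support/Regex.h"
#include <cassert>

int main() {
  // The same pattern TypeSanitizer compiles in its constructor.
  llvm::Regex AnonNameRegex("^_ZTS.*N[1-9][0-9]*_GLOBAL__N");

  // (anonymous namespace)::X: the descriptor stays module-local, so
  // ShouldBeComdat ends up false for this name.
  assert(AnonNameRegex.match("_ZTSN12_GLOBAL__N_11XE"));

  // A namespace-scope struct S with external linkage: the descriptor can be
  // emitted with a comdat and shared across translation units.
  assert(!AnonNameRegex.match("_ZTS1S"));
  return 0;
}
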
>From 98ece165fc5a6476ddf08949e50b3b34228f9f32 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 18 Apr 2024 23:01:03 +0100
Subject: [PATCH 15/18] [TySan] A Type Sanitizer (Clang)
---
clang/include/clang/Basic/Features.def | 1 +
clang/include/clang/Basic/Sanitizers.def | 3 ++
clang/include/clang/Driver/SanitizerArgs.h | 1 +
clang/lib/CodeGen/BackendUtil.cpp | 6 ++++
clang/lib/CodeGen/CGDecl.cpp | 3 +-
clang/lib/CodeGen/CGDeclCXX.cpp | 4 +++
clang/lib/CodeGen/CodeGenFunction.cpp | 2 ++
clang/lib/CodeGen/CodeGenModule.cpp | 12 ++++---
clang/lib/CodeGen/CodeGenTBAA.cpp | 6 ++--
clang/lib/CodeGen/SanitizerMetadata.cpp | 40 +++++++++++++++++-----
clang/lib/CodeGen/SanitizerMetadata.h | 13 +++----
clang/lib/Driver/SanitizerArgs.cpp | 13 ++++---
clang/lib/Driver/ToolChains/CommonArgs.cpp | 6 +++-
clang/lib/Driver/ToolChains/Darwin.cpp | 6 ++++
clang/lib/Driver/ToolChains/Linux.cpp | 2 ++
clang/test/Driver/sanitizer-ld.c | 23 +++++++++++++
16 files changed, 114 insertions(+), 27 deletions(-)
diff --git a/clang/include/clang/Basic/Features.def b/clang/include/clang/Basic/Features.def
index 15c59c6bcdf29c..c82b6d9b5f6c10 100644
--- a/clang/include/clang/Basic/Features.def
+++ b/clang/include/clang/Basic/Features.def
@@ -102,6 +102,7 @@ FEATURE(numerical_stability_sanitizer, LangOpts.Sanitize.has(SanitizerKind::Nume
FEATURE(memory_sanitizer,
LangOpts.Sanitize.hasOneOf(SanitizerKind::Memory |
SanitizerKind::KernelMemory))
+FEATURE(type_sanitizer, LangOpts.Sanitize.has(SanitizerKind::Type))
FEATURE(thread_sanitizer, LangOpts.Sanitize.has(SanitizerKind::Thread))
FEATURE(dataflow_sanitizer, LangOpts.Sanitize.has(SanitizerKind::DataFlow))
FEATURE(scudo, LangOpts.Sanitize.hasOneOf(SanitizerKind::Scudo))
diff --git a/clang/include/clang/Basic/Sanitizers.def b/clang/include/clang/Basic/Sanitizers.def
index 9223f62b3639a7..f234488eaa80cf 100644
--- a/clang/include/clang/Basic/Sanitizers.def
+++ b/clang/include/clang/Basic/Sanitizers.def
@@ -73,6 +73,9 @@ SANITIZER("fuzzer", Fuzzer)
// libFuzzer-required instrumentation, no linking.
SANITIZER("fuzzer-no-link", FuzzerNoLink)
+// TypeSanitizer
+SANITIZER("type", Type)
+
// ThreadSanitizer
SANITIZER("thread", Thread)
diff --git a/clang/include/clang/Driver/SanitizerArgs.h b/clang/include/clang/Driver/SanitizerArgs.h
index 0c6f3869549ef7..4f08ea2b260179 100644
--- a/clang/include/clang/Driver/SanitizerArgs.h
+++ b/clang/include/clang/Driver/SanitizerArgs.h
@@ -87,6 +87,7 @@ class SanitizerArgs {
bool needsHwasanAliasesRt() const {
return needsHwasanRt() && HwasanUseAliases;
}
+ bool needsTysanRt() const { return Sanitizers.has(SanitizerKind::Type); }
bool needsTsanRt() const { return Sanitizers.has(SanitizerKind::Thread); }
bool needsMsanRt() const { return Sanitizers.has(SanitizerKind::Memory); }
bool needsFuzzer() const { return Sanitizers.has(SanitizerKind::Fuzzer); }
diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index bf9b04f02e9f44..014dc5cdeb616e 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -77,6 +77,7 @@
#include "llvm/Transforms/Instrumentation/SanitizerBinaryMetadata.h"
#include "llvm/Transforms/Instrumentation/SanitizerCoverage.h"
#include "llvm/Transforms/Instrumentation/ThreadSanitizer.h"
+#include "llvm/Transforms/Instrumentation/TypeSanitizer.h"
#include "llvm/Transforms/ObjCARC.h"
#include "llvm/Transforms/Scalar/EarlyCSE.h"
#include "llvm/Transforms/Scalar/GVN.h"
@@ -735,6 +736,11 @@ static void addSanitizers(const Triple &TargetTriple,
MPM.addPass(createModuleToFunctionPassAdaptor(ThreadSanitizerPass()));
}
+ if (LangOpts.Sanitize.has(SanitizerKind::Type)) {
+ MPM.addPass(ModuleTypeSanitizerPass());
+ MPM.addPass(createModuleToFunctionPassAdaptor(TypeSanitizerPass()));
+ }
+
if (LangOpts.Sanitize.has(SanitizerKind::NumericalStability))
MPM.addPass(NumericalStabilitySanitizerPass());
diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp
index 47b21bc9f63f04..bb9d120c37ca86 100644
--- a/clang/lib/CodeGen/CGDecl.cpp
+++ b/clang/lib/CodeGen/CGDecl.cpp
@@ -458,7 +458,8 @@ void CodeGenFunction::EmitStaticVarDecl(const VarDecl &D,
LocalDeclMap.find(&D)->second = Address(castedAddr, elemTy, alignment);
CGM.setStaticLocalDeclAddress(&D, castedAddr);
- CGM.getSanitizerMetadata()->reportGlobal(var, D);
+ CGM.getSanitizerMetadata()->reportGlobalToASan(var, D);
+ CGM.getSanitizerMetadata()->reportGlobalToTySan(var, D);
// Emit global variable debug descriptor for static vars.
CGDebugInfo *DI = getDebugInfo();
diff --git a/clang/lib/CodeGen/CGDeclCXX.cpp b/clang/lib/CodeGen/CGDeclCXX.cpp
index 2c3054605ee754..96517511b21114 100644
--- a/clang/lib/CodeGen/CGDeclCXX.cpp
+++ b/clang/lib/CodeGen/CGDeclCXX.cpp
@@ -479,6 +479,10 @@ llvm::Function *CodeGenModule::CreateGlobalInitOrCleanUpFunction(
!isInNoSanitizeList(SanitizerKind::MemtagStack, Fn, Loc))
Fn->addFnAttr(llvm::Attribute::SanitizeMemTag);
+ if (getLangOpts().Sanitize.has(SanitizerKind::Type) &&
+ !isInNoSanitizeList(SanitizerKind::Type, Fn, Loc))
+ Fn->addFnAttr(llvm::Attribute::SanitizeType);
+
if (getLangOpts().Sanitize.has(SanitizerKind::Thread) &&
!isInNoSanitizeList(SanitizerKind::Thread, Fn, Loc))
Fn->addFnAttr(llvm::Attribute::SanitizeThread);
diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp
index 2bc10cdd2d3441..af58fa64f86585 100644
--- a/clang/lib/CodeGen/CodeGenFunction.cpp
+++ b/clang/lib/CodeGen/CodeGenFunction.cpp
@@ -837,6 +837,8 @@ void CodeGenFunction::StartFunction(GlobalDecl GD, QualType RetTy,
Fn->addFnAttr(llvm::Attribute::SanitizeMemTag);
if (SanOpts.has(SanitizerKind::Thread))
Fn->addFnAttr(llvm::Attribute::SanitizeThread);
+ if (SanOpts.has(SanitizerKind::Type))
+ Fn->addFnAttr(llvm::Attribute::SanitizeType);
if (SanOpts.has(SanitizerKind::NumericalStability))
Fn->addFnAttr(llvm::Attribute::SanitizeNumericalStability);
if (SanOpts.hasOneOf(SanitizerKind::Memory | SanitizerKind::KernelMemory))
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index d3d5c0743a520b..a2f6a8a481113d 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -397,8 +397,8 @@ CodeGenModule::CodeGenModule(ASTContext &C,
if (LangOpts.HLSL)
createHLSLRuntime();
- // Enable TBAA unless it's suppressed. ThreadSanitizer needs TBAA even at O0.
- if (LangOpts.Sanitize.has(SanitizerKind::Thread) ||
+ // Enable TBAA unless it's suppressed. TSan and TySan need TBAA even at O0.
+ if (LangOpts.Sanitize.hasOneOf(SanitizerKind::Thread | SanitizerKind::Type) ||
(!CodeGenOpts.RelaxedAliasing && CodeGenOpts.OptimizationLevel > 0))
TBAA.reset(new CodeGenTBAA(Context, getTypes(), TheModule, CodeGenOpts,
getLangOpts()));
@@ -5162,7 +5162,7 @@ CodeGenModule::GetOrCreateLLVMGlobal(StringRef MangledName, llvm::Type *Ty,
}
if (D)
- SanitizerMD->reportGlobal(GV, *D);
+ SanitizerMD->reportGlobalToASan(GV, *D);
LangAS ExpectedAS =
D ? D->getType().getAddressSpace()
@@ -5728,7 +5728,8 @@ void CodeGenModule::EmitGlobalVarDefinition(const VarDecl *D,
if (NeedsGlobalCtor || NeedsGlobalDtor)
EmitCXXGlobalVarDeclInitFunc(D, GV, NeedsGlobalCtor);
- SanitizerMD->reportGlobal(GV, *D, NeedsGlobalCtor);
+ SanitizerMD->reportGlobalToASan(GV, *D, NeedsGlobalCtor);
+ SanitizerMD->reportGlobalToTySan(GV, *D);
// Emit global variable debug information.
if (CGDebugInfo *DI = getModuleDebugInfo())
@@ -6618,7 +6619,8 @@ CodeGenModule::GetAddrOfConstantStringFromLiteral(const StringLiteral *S,
if (Entry)
*Entry = GV;
- SanitizerMD->reportGlobal(GV, S->getStrTokenLoc(0), "<string literal>");
+ SanitizerMD->reportGlobalToASan(GV, S->getStrTokenLoc(0), "<string literal>");
+ // FIXME: Should we also report this to TySan?
return ConstantAddress(castStringLiteralToDefaultAddressSpace(*this, GV),
GV->getValueType(), Alignment);
diff --git a/clang/lib/CodeGen/CodeGenTBAA.cpp b/clang/lib/CodeGen/CodeGenTBAA.cpp
index 6eed8e1d2b671a..75e66bae79afdc 100644
--- a/clang/lib/CodeGen/CodeGenTBAA.cpp
+++ b/clang/lib/CodeGen/CodeGenTBAA.cpp
@@ -314,8 +314,10 @@ llvm::MDNode *CodeGenTBAA::getTypeInfoHelper(const Type *Ty) {
}
llvm::MDNode *CodeGenTBAA::getTypeInfo(QualType QTy) {
- // At -O0 or relaxed aliasing, TBAA is not emitted for regular types.
- if (CodeGenOpts.OptimizationLevel == 0 || CodeGenOpts.RelaxedAliasing)
+ // At -O0 or relaxed aliasing, TBAA is not emitted for regular types (unless
+ // we're running TypeSanitizer).
+ if (!Features.Sanitize.has(SanitizerKind::Type) &&
+ (CodeGenOpts.OptimizationLevel == 0 || CodeGenOpts.RelaxedAliasing))
return nullptr;
// If the type has the may_alias attribute (even on a typedef), it is
diff --git a/clang/lib/CodeGen/SanitizerMetadata.cpp b/clang/lib/CodeGen/SanitizerMetadata.cpp
index c1a6b223480a19..c551a2529805c1 100644
--- a/clang/lib/CodeGen/SanitizerMetadata.cpp
+++ b/clang/lib/CodeGen/SanitizerMetadata.cpp
@@ -31,11 +31,11 @@ static SanitizerMask expandKernelSanitizerMasks(SanitizerMask Mask) {
return Mask;
}
-void SanitizerMetadata::reportGlobal(llvm::GlobalVariable *GV,
- SourceLocation Loc, StringRef Name,
- QualType Ty,
- SanitizerMask NoSanitizeAttrMask,
- bool IsDynInit) {
+void SanitizerMetadata::reportGlobalToASan(llvm::GlobalVariable *GV,
+ SourceLocation Loc, StringRef Name,
+ QualType Ty,
+ SanitizerMask NoSanitizeAttrMask,
+ bool IsDynInit) {
SanitizerSet FsanitizeArgument = CGM.getLangOpts().Sanitize;
if (!isAsanHwasanOrMemTag(FsanitizeArgument))
return;
@@ -72,8 +72,8 @@ void SanitizerMetadata::reportGlobal(llvm::GlobalVariable *GV,
GV->setSanitizerMetadata(Meta);
}
-void SanitizerMetadata::reportGlobal(llvm::GlobalVariable *GV, const VarDecl &D,
- bool IsDynInit) {
+void SanitizerMetadata::reportGlobalToASan(llvm::GlobalVariable *GV,
+ const VarDecl &D, bool IsDynInit) {
if (!isAsanHwasanOrMemTag(CGM.getLangOpts().Sanitize))
return;
std::string QualName;
@@ -95,6 +95,30 @@ void SanitizerMetadata::reportGlobal(llvm::GlobalVariable *GV, const VarDecl &D,
IsDynInit);
}
+void SanitizerMetadata::reportGlobalToTySan(llvm::GlobalVariable *GV,
+ const VarDecl &D) {
+ if (!CGM.getLangOpts().Sanitize.has(SanitizerKind::Type))
+ return;
+
+ for (auto Attr : D.specific_attrs<NoSanitizeAttr>())
+ if (Attr->getMask() & SanitizerKind::Type)
+ return;
+
+ QualType QTy = D.getType();
+ llvm::MDNode *TBAAInfo = CGM.getTBAATypeInfo(QTy);
+ if (!TBAAInfo || TBAAInfo == CGM.getTBAATypeInfo(CGM.getContext().CharTy))
+ return;
+
+ llvm::Metadata *GlobalMetadata[] = {llvm::ConstantAsMetadata::get(GV),
+ TBAAInfo};
+
+ llvm::MDNode *ThisGlobal =
+ llvm::MDNode::get(CGM.getLLVMContext(), GlobalMetadata);
+ llvm::NamedMDNode *TysanGlobals =
+ CGM.getModule().getOrInsertNamedMetadata("llvm.tysan.globals");
+ TysanGlobals->addOperand(ThisGlobal);
+}
+
void SanitizerMetadata::disableSanitizerForGlobal(llvm::GlobalVariable *GV) {
- reportGlobal(GV, SourceLocation(), "", QualType(), SanitizerKind::All);
+ reportGlobalToASan(GV, SourceLocation(), "", QualType(), SanitizerKind::All);
}
diff --git a/clang/lib/CodeGen/SanitizerMetadata.h b/clang/lib/CodeGen/SanitizerMetadata.h
index 000f02cf8dcf11..9de087c518c6ad 100644
--- a/clang/lib/CodeGen/SanitizerMetadata.h
+++ b/clang/lib/CodeGen/SanitizerMetadata.h
@@ -37,12 +37,13 @@ class SanitizerMetadata {
public:
SanitizerMetadata(CodeGenModule &CGM);
- void reportGlobal(llvm::GlobalVariable *GV, const VarDecl &D,
- bool IsDynInit = false);
- void reportGlobal(llvm::GlobalVariable *GV, SourceLocation Loc,
- StringRef Name, QualType Ty = {},
- SanitizerMask NoSanitizeAttrMask = {},
- bool IsDynInit = false);
+ void reportGlobalToASan(llvm::GlobalVariable *GV, const VarDecl &D,
+ bool IsDynInit = false);
+ void reportGlobalToASan(llvm::GlobalVariable *GV, SourceLocation Loc,
+ StringRef Name, QualType Ty = {},
+ SanitizerMask NoSanitizeAttrMask = {},
+ bool IsDynInit = false);
+ void reportGlobalToTySan(llvm::GlobalVariable *GV, const VarDecl &D);
void disableSanitizerForGlobal(llvm::GlobalVariable *GV);
};
} // end namespace CodeGen
diff --git a/clang/lib/Driver/SanitizerArgs.cpp b/clang/lib/Driver/SanitizerArgs.cpp
index 1abfe8fd92807e..e826cd627693f4 100644
--- a/clang/lib/Driver/SanitizerArgs.cpp
+++ b/clang/lib/Driver/SanitizerArgs.cpp
@@ -37,15 +37,15 @@ static const SanitizerMask NotAllowedWithMinimalRuntime = SanitizerKind::Vptr;
static const SanitizerMask NotAllowedWithExecuteOnly =
SanitizerKind::Function | SanitizerKind::KCFI;
static const SanitizerMask NeedsUnwindTables =
- SanitizerKind::Address | SanitizerKind::HWAddress | SanitizerKind::Thread |
+ SanitizerKind::Address | SanitizerKind::HWAddress | SanitizerKind::Type | SanitizerKind::Thread |
SanitizerKind::Memory | SanitizerKind::DataFlow |
SanitizerKind::NumericalStability;
static const SanitizerMask SupportsCoverage =
SanitizerKind::Address | SanitizerKind::HWAddress |
SanitizerKind::KernelAddress | SanitizerKind::KernelHWAddress |
- SanitizerKind::MemtagStack | SanitizerKind::MemtagHeap |
- SanitizerKind::MemtagGlobals | SanitizerKind::Memory |
- SanitizerKind::KernelMemory | SanitizerKind::Leak |
+ SanitizerKind::Type | SanitizerKind::MemtagStack |
+ SanitizerKind::MemtagHeap | SanitizerKind::MemtagGlobals |
+ SanitizerKind::Memory | SanitizerKind::KernelMemory | SanitizerKind::Leak |
SanitizerKind::Undefined | SanitizerKind::Integer | SanitizerKind::Bounds |
SanitizerKind::ImplicitConversion | SanitizerKind::Nullability |
SanitizerKind::DataFlow | SanitizerKind::Fuzzer |
@@ -182,6 +182,7 @@ static void addDefaultIgnorelists(const Driver &D, SanitizerMask Kinds,
{"msan_ignorelist.txt", SanitizerKind::Memory},
{"nsan_ignorelist.txt", SanitizerKind::NumericalStability},
{"tsan_ignorelist.txt", SanitizerKind::Thread},
+ {"tysan_blacklist.txt", SanitizerKind::Type},
{"dfsan_abilist.txt", SanitizerKind::DataFlow},
{"cfi_ignorelist.txt", SanitizerKind::CFI},
{"ubsan_ignorelist.txt",
@@ -526,6 +527,10 @@ SanitizerArgs::SanitizerArgs(const ToolChain &TC,
std::pair<SanitizerMask, SanitizerMask> IncompatibleGroups[] = {
std::make_pair(SanitizerKind::Address,
SanitizerKind::Thread | SanitizerKind::Memory),
+ std::make_pair(SanitizerKind::Type,
+ SanitizerKind::Address | SanitizerKind::KernelAddress |
+ SanitizerKind::Memory | SanitizerKind::Leak |
+ SanitizerKind::Thread),
std::make_pair(SanitizerKind::Thread, SanitizerKind::Memory),
std::make_pair(SanitizerKind::Leak,
SanitizerKind::Thread | SanitizerKind::Memory),
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 8d977149e62485..b29bad6f908798 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -1441,8 +1441,10 @@ collectSanitizerRuntimes(const ToolChain &TC, const ArgList &Args,
if (SanArgs.needsScudoRt()) {
SharedRuntimes.push_back("scudo_standalone");
}
- if (SanArgs.needsTsanRt())
+ if (SanArgs.needsTsanRt() && SanArgs.linkRuntimes())
SharedRuntimes.push_back("tsan");
+ if (SanArgs.needsTysanRt())
+ StaticRuntimes.push_back("tysan");
if (SanArgs.needsHwasanRt()) {
if (SanArgs.needsHwasanAliasesRt())
SharedRuntimes.push_back("hwasan_aliases");
@@ -1515,6 +1517,8 @@ collectSanitizerRuntimes(const ToolChain &TC, const ArgList &Args,
if (SanArgs.linkCXXRuntimes())
StaticRuntimes.push_back("tsan_cxx");
}
+ if (!SanArgs.needsSharedRt() && SanArgs.needsTysanRt())
+ StaticRuntimes.push_back("tysan");
if (!SanArgs.needsSharedRt() && SanArgs.needsUbsanRt()) {
if (SanArgs.requiresMinimalRuntime()) {
StaticRuntimes.push_back("ubsan_minimal");
diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp
index 87380869f6fdab..7bd3179deb227a 100644
--- a/clang/lib/Driver/ToolChains/Darwin.cpp
+++ b/clang/lib/Driver/ToolChains/Darwin.cpp
@@ -1596,6 +1596,8 @@ void DarwinClang::AddLinkRuntimeLibArgs(const ArgList &Args,
"Static sanitizer runtimes not supported");
AddLinkSanitizerLibArgs(Args, CmdArgs, "tsan");
}
+ if (Sanitize.needsTysanRt())
+ AddLinkSanitizerLibArgs(Args, CmdArgs, "tysan");
if (Sanitize.needsFuzzer() && !Args.hasArg(options::OPT_dynamiclib)) {
AddLinkSanitizerLibArgs(Args, CmdArgs, "fuzzer", /*shared=*/false);
@@ -3599,6 +3601,10 @@ SanitizerMask Darwin::getSupportedSanitizers() const {
Res |= SanitizerKind::Thread;
}
+ if ((IsX86_64 || IsAArch64) && isTargetMacOSBased()) {
+ Res |= SanitizerKind::Type;
+ }
+
if (IsX86_64)
Res |= SanitizerKind::NumericalStability;
diff --git a/clang/lib/Driver/ToolChains/Linux.cpp b/clang/lib/Driver/ToolChains/Linux.cpp
index 46962e88d45502..c91b55b5a2948c 100644
--- a/clang/lib/Driver/ToolChains/Linux.cpp
+++ b/clang/lib/Driver/ToolChains/Linux.cpp
@@ -837,6 +837,8 @@ SanitizerMask Linux::getSupportedSanitizers() const {
if (IsX86_64 || IsMIPS64 || IsAArch64 || IsPowerPC64 || IsSystemZ ||
IsLoongArch64 || IsRISCV64)
Res |= SanitizerKind::Thread;
+ if (IsX86_64 || IsAArch64)
+ Res |= SanitizerKind::Type;
if (IsX86_64 || IsSystemZ || IsPowerPC64)
Res |= SanitizerKind::KernelMemory;
if (IsX86_64 || IsMIPS64 || IsAArch64 || IsX86 || IsMIPS || IsArmArch ||
diff --git a/clang/test/Driver/sanitizer-ld.c b/clang/test/Driver/sanitizer-ld.c
index c83066a334001a..60d60a6047b0f4 100644
--- a/clang/test/Driver/sanitizer-ld.c
+++ b/clang/test/Driver/sanitizer-ld.c
@@ -274,6 +274,29 @@
// CHECK-ASAN-ANDROID-SHARED-NOT: "-lpthread"
// CHECK-ASAN-ANDROID-SHARED-NOT: "-lresolv"
+
+// RUN: %clangxx %s -### -o %t.o 2>&1 \
+// RUN: --target=x86_64-unknown-linux -fuse-ld=ld -stdlib=platform -lstdc++ \
+// RUN: -fsanitize=type \
+// RUN: -resource-dir=%S/Inputs/resource_dir \
+// RUN: --sysroot=%S/Inputs/basic_linux_tree \
+// RUN: | FileCheck --check-prefix=CHECK-TYSAN-LINUX-CXX %s
+//
+// CHECK-TYSAN-LINUX-CXX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
+// CHECK-TYSAN-LINUX-CXX-NOT: stdc++
+// CHECK-TYSAN-LINUX-CXX: "--whole-archive" "{{.*}}libclang_rt.tysan{{[^.]*}}.a" "--no-whole-archive"
+// CHECK-TYSAN-LINUX-CXX: stdc++
+
+// RUN: %clangxx -fsanitize=type -### %s 2>&1 \
+// RUN: -mmacosx-version-min=10.6 \
+// RUN: --target=x86_64-apple-darwin13.4.0 -fuse-ld=ld -stdlib=platform \
+// RUN: -resource-dir=%S/Inputs/resource_dir \
+// RUN: --sysroot=%S/Inputs/basic_linux_tree \
+// RUN: | FileCheck --check-prefix=CHECK-TYSAN-DARWIN-CXX %s
+// CHECK-TYSAN-DARWIN-CXX: "{{.*}}ld{{(.exe)?}}"
+// CHECK-TYSAN-DARWIN-CXX: libclang_rt.tysan_osx_dynamic.dylib
+// CHECK-TYSAN-DARWIN-CXX-NOT: -lc++abi
+
// RUN: %clangxx -### %s 2>&1 \
// RUN: --target=x86_64-unknown-linux -fuse-ld=ld -stdlib=platform -lstdc++ \
// RUN: -fsanitize=thread \
>From b08626a35d9f55f25ebdc381699302da0f29bad2 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Thu, 18 Apr 2024 23:03:05 +0100
Subject: [PATCH 16/18] !fixup: add test
---
clang/lib/CodeGen/SanitizerMetadata.cpp | 2 +-
clang/test/CodeGen/sanitize-type-attr.cpp | 74 +++++++++++++++++++++++
2 files changed, 75 insertions(+), 1 deletion(-)
create mode 100644 clang/test/CodeGen/sanitize-type-attr.cpp
diff --git a/clang/lib/CodeGen/SanitizerMetadata.cpp b/clang/lib/CodeGen/SanitizerMetadata.cpp
index c551a2529805c1..903ee65dd3eaa4 100644
--- a/clang/lib/CodeGen/SanitizerMetadata.cpp
+++ b/clang/lib/CodeGen/SanitizerMetadata.cpp
@@ -91,7 +91,7 @@ void SanitizerMetadata::reportGlobalToASan(llvm::GlobalVariable *GV,
return NoSanitizeMask;
};
- reportGlobal(GV, D.getLocation(), QualName, D.getType(), getNoSanitizeMask(D),
+ reportGlobalToASan(GV, D.getLocation(), QualName, D.getType(), getNoSanitizeMask(D),
IsDynInit);
}
diff --git a/clang/test/CodeGen/sanitize-type-attr.cpp b/clang/test/CodeGen/sanitize-type-attr.cpp
new file mode 100644
index 00000000000000..4da8488e1f9486
--- /dev/null
+++ b/clang/test/CodeGen/sanitize-type-attr.cpp
@@ -0,0 +1,74 @@
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -o - %s | FileCheck -check-prefix=WITHOUT %s
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -o - %s -fsanitize=type | FileCheck -check-prefix=TYSAN %s
+// RUN: echo "src:%s" | sed -e 's/\\/\\\\/g' > %t
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm -o - %s -fsanitize=type -fsanitize-blacklist=%t | FileCheck -check-prefix=BL %s
+
+// The sanitize_type attribute should be attached to functions
+// when TypeSanitizer is enabled, unless no_sanitize("type") attribute
+// is present.
+
+// WITHOUT: NoTYSAN1{{.*}}) [[NOATTR:#[0-9]+]]
+// BL: NoTYSAN1{{.*}}) [[NOATTR:#[0-9]+]]
+// TYSAN: NoTYSAN1{{.*}}) [[NOATTR:#[0-9]+]]
+__attribute__((no_sanitize("type"))) int NoTYSAN1(int *a) { return *a; }
+
+// WITHOUT: NoTYSAN2{{.*}}) [[NOATTR]]
+// BL: NoTYSAN2{{.*}}) [[NOATTR]]
+// TYSAN: NoTYSAN2{{.*}}) [[NOATTR]]
+__attribute__((no_sanitize("type"))) int NoTYSAN2(int *a);
+int NoTYSAN2(int *a) { return *a; }
+
+// WITHOUT: NoTYSAN3{{.*}}) [[NOATTR:#[0-9]+]]
+// BL: NoTYSAN3{{.*}}) [[NOATTR:#[0-9]+]]
+// TYSAN: NoTYSAN3{{.*}}) [[NOATTR:#[0-9]+]]
+__attribute__((no_sanitize("type"))) int NoTYSAN3(int *a) { return *a; }
+
+// WITHOUT: TYSANOk{{.*}}) [[NOATTR]]
+// BL: TYSANOk{{.*}}) [[NOATTR]]
+// TYSAN: TYSANOk{{.*}}) [[WITH:#[0-9]+]]
+int TYSANOk(int *a) { return *a; }
+
+// WITHOUT: TemplateTYSANOk{{.*}}) [[NOATTR]]
+// BL: TemplateTYSANOk{{.*}}) [[NOATTR]]
+// TYSAN: TemplateTYSANOk{{.*}}) [[WITH]]
+template <int i>
+int TemplateTYSANOk() { return i; }
+
+// WITHOUT: TemplateNoTYSAN{{.*}}) [[NOATTR]]
+// BL: TemplateNoTYSAN{{.*}}) [[NOATTR]]
+// TYSAN: TemplateNoTYSAN{{.*}}) [[NOATTR]]
+template <int i>
+__attribute__((no_sanitize("type"))) int TemplateNoTYSAN() { return i; }
+
+int force_instance = TemplateTYSANOk<42>() + TemplateNoTYSAN<42>();
+
+// Check that __cxx_global_var_init* get the sanitize_type attribute.
+int global1 = 0;
+int global2 = *(int *)((char *)&global1 + 1);
+// WITHOUT: @__cxx_global_var_init{{.*}}[[NOATTR:#[0-9]+]]
+// BL: @__cxx_global_var_init{{.*}}[[NOATTR:#[0-9]+]]
+// TYSAN: @__cxx_global_var_init{{.*}}[[WITH:#[0-9]+]]
+
+// Make sure that we don't add globals to the list for which we don't have a
+// specific type description.
+// FIXME: We now have a type description for this type, so a global entry is added. Should it be?
+struct SX {
+ int a, b;
+};
+SX sx;
+
+// WITHOUT: attributes [[NOATTR]] = { noinline nounwind{{.*}} }
+
+// BL: attributes [[NOATTR]] = { noinline nounwind{{.*}} }
+
+// TYSAN: attributes [[NOATTR]] = { mustprogress noinline nounwind{{.*}} }
+// TYSAN: attributes [[WITH]] = { noinline nounwind sanitize_type{{.*}} }
+
+// TYSAN-DAG: !llvm.tysan.globals = !{[[G1MD:![0-9]+]], [[G2MD:![0-9]+]], [[G3MD:![0-9]+]], [[SXMD:![0-9]+]]}
+// TYSAN-DAG: [[G1MD]] = !{ptr @force_instance, [[INTMD:![0-9]+]]}
+// TYSAN-DAG: [[INTMD]] = !{!"int",
+// TYSAN-DAG: [[G2MD]] = !{ptr @global1, [[INTMD]]}
+// TYSAN-DAG: [[G3MD]] = !{ptr @global2, [[INTMD]]}
+// TYSAN-DAG: [[SXMD]] = !{ptr @sx, [[SXTYMD:![0-9]+]]}
+// TYSAN-DAG: [[SXTYMD]] = !{!"_ZTS2SX", [[INTMD]], i64 0, !1, i64 4}
+// TYSAN-DAG: Simple C++ TBAA
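
The FIXME above asks whether aggregates like SX should be registered now that they have a type description. As a source-level illustration (a hypothetical translation unit, not part of the test), the checks added in reportGlobalToTySan earlier in this series imply roughly the following when compiling with -fsanitize=type:

// Hypothetical example, built with -fsanitize=type.
int counter;    // distinct "int" TBAA node -> gets an llvm.tysan.globals entry
char tag;       // omnipotent char aliases everything -> skipped
__attribute__((no_sanitize("type")))
int opted_out;  // explicitly opted out via no_sanitize("type") -> skipped
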
>From bdd4f6913cf72f3ed8feba76bfe0859de7bd7003 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 22 Nov 2024 15:01:41 +0000
Subject: [PATCH 17/18] [TySan] A Type Sanitizer (Runtime Library)
---
clang/runtime/CMakeLists.txt | 2 +-
.../cmake/Modules/AllSupportedArchDefs.cmake | 1 +
compiler-rt/cmake/config-ix.cmake | 15 +-
compiler-rt/lib/tysan/CMakeLists.txt | 64 ++++
compiler-rt/lib/tysan/lit.cfg | 35 ++
compiler-rt/lib/tysan/lit.site.cfg.in | 12 +
compiler-rt/lib/tysan/tysan.cpp | 344 ++++++++++++++++++
compiler-rt/lib/tysan/tysan.h | 79 ++++
compiler-rt/lib/tysan/tysan.syms.extra | 2 +
compiler-rt/lib/tysan/tysan_flags.inc | 17 +
compiler-rt/lib/tysan/tysan_interceptors.cpp | 250 +++++++++++++
compiler-rt/lib/tysan/tysan_platform.h | 93 +++++
compiler-rt/test/tysan/CMakeLists.txt | 32 ++
compiler-rt/test/tysan/anon-ns.cpp | 41 +++
compiler-rt/test/tysan/anon-same-struct.c | 26 ++
compiler-rt/test/tysan/anon-struct.c | 27 ++
compiler-rt/test/tysan/basic.c | 65 ++++
compiler-rt/test/tysan/char-memcpy.c | 45 +++
.../test/tysan/constexpr-subobject.cpp | 25 ++
compiler-rt/test/tysan/global.c | 31 ++
compiler-rt/test/tysan/int-long.c | 21 ++
compiler-rt/test/tysan/lit.cfg.py | 139 +++++++
compiler-rt/test/tysan/lit.site.cfg.py.in | 17 +
compiler-rt/test/tysan/ptr-float.c | 19 +
...ruct-offset-multiple-compilation-units.cpp | 51 +++
compiler-rt/test/tysan/struct-offset.c | 26 ++
compiler-rt/test/tysan/struct.c | 39 ++
compiler-rt/test/tysan/union-wr-wr.c | 18 +
compiler-rt/test/tysan/violation-pr45282.c | 32 ++
compiler-rt/test/tysan/violation-pr47137.c | 40 ++
compiler-rt/test/tysan/violation-pr51837.c | 34 ++
compiler-rt/test/tysan/violation-pr62544.c | 24 ++
compiler-rt/test/tysan/violation-pr62828.cpp | 44 +++
compiler-rt/test/tysan/violation-pr68655.cpp | 40 ++
compiler-rt/test/tysan/violation-pr86685.c | 29 ++
35 files changed, 1777 insertions(+), 2 deletions(-)
create mode 100644 compiler-rt/lib/tysan/CMakeLists.txt
create mode 100644 compiler-rt/lib/tysan/lit.cfg
create mode 100644 compiler-rt/lib/tysan/lit.site.cfg.in
create mode 100644 compiler-rt/lib/tysan/tysan.cpp
create mode 100644 compiler-rt/lib/tysan/tysan.h
create mode 100644 compiler-rt/lib/tysan/tysan.syms.extra
create mode 100644 compiler-rt/lib/tysan/tysan_flags.inc
create mode 100644 compiler-rt/lib/tysan/tysan_interceptors.cpp
create mode 100644 compiler-rt/lib/tysan/tysan_platform.h
create mode 100644 compiler-rt/test/tysan/CMakeLists.txt
create mode 100644 compiler-rt/test/tysan/anon-ns.cpp
create mode 100644 compiler-rt/test/tysan/anon-same-struct.c
create mode 100644 compiler-rt/test/tysan/anon-struct.c
create mode 100644 compiler-rt/test/tysan/basic.c
create mode 100644 compiler-rt/test/tysan/char-memcpy.c
create mode 100644 compiler-rt/test/tysan/constexpr-subobject.cpp
create mode 100644 compiler-rt/test/tysan/global.c
create mode 100644 compiler-rt/test/tysan/int-long.c
create mode 100644 compiler-rt/test/tysan/lit.cfg.py
create mode 100644 compiler-rt/test/tysan/lit.site.cfg.py.in
create mode 100644 compiler-rt/test/tysan/ptr-float.c
create mode 100644 compiler-rt/test/tysan/struct-offset-multiple-compilation-units.cpp
create mode 100644 compiler-rt/test/tysan/struct-offset.c
create mode 100644 compiler-rt/test/tysan/struct.c
create mode 100644 compiler-rt/test/tysan/union-wr-wr.c
create mode 100644 compiler-rt/test/tysan/violation-pr45282.c
create mode 100644 compiler-rt/test/tysan/violation-pr47137.c
create mode 100644 compiler-rt/test/tysan/violation-pr51837.c
create mode 100644 compiler-rt/test/tysan/violation-pr62544.c
create mode 100644 compiler-rt/test/tysan/violation-pr62828.cpp
create mode 100644 compiler-rt/test/tysan/violation-pr68655.cpp
create mode 100644 compiler-rt/test/tysan/violation-pr86685.c
diff --git a/clang/runtime/CMakeLists.txt b/clang/runtime/CMakeLists.txt
index 65fcdc2868f031..ff2605b23d25b0 100644
--- a/clang/runtime/CMakeLists.txt
+++ b/clang/runtime/CMakeLists.txt
@@ -122,7 +122,7 @@ if(LLVM_BUILD_EXTERNAL_COMPILER_RT AND EXISTS ${COMPILER_RT_SRC_ROOT}/)
COMPONENT compiler-rt)
# Add top-level targets that build specific compiler-rt runtimes.
- set(COMPILER_RT_RUNTIMES fuzzer asan builtins dfsan lsan msan profile tsan ubsan ubsan-minimal)
+ set(COMPILER_RT_RUNTIMES fuzzer asan builtins dfsan lsan msan profile tsan tysan ubsan ubsan-minimal)
foreach(runtime ${COMPILER_RT_RUNTIMES})
get_ext_project_build_command(build_runtime_cmd ${runtime})
add_custom_target(${runtime}
diff --git a/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake b/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake
index b29ae179c2b4f4..ad6784c7ba8833 100644
--- a/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake
+++ b/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake
@@ -85,6 +85,7 @@ else()
set(ALL_TSAN_SUPPORTED_ARCH ${X86_64} ${MIPS64} ${ARM64} ${PPC64} ${S390X}
${LOONGARCH64} ${RISCV64})
endif()
+set(ALL_TYSAN_SUPPORTED_ARCH ${X86_64} ${ARM64})
set(ALL_UBSAN_SUPPORTED_ARCH ${X86} ${X86_64} ${ARM32} ${ARM64} ${RISCV64}
${MIPS32} ${MIPS64} ${PPC64} ${S390X} ${SPARC} ${SPARCV9} ${HEXAGON}
${LOONGARCH64})
diff --git a/compiler-rt/cmake/config-ix.cmake b/compiler-rt/cmake/config-ix.cmake
index 6d52eecc9a91fe..cf729c3adb1f5f 100644
--- a/compiler-rt/cmake/config-ix.cmake
+++ b/compiler-rt/cmake/config-ix.cmake
@@ -458,6 +458,7 @@ if(APPLE)
set(SANITIZER_COMMON_SUPPORTED_OS osx)
set(PROFILE_SUPPORTED_OS osx)
set(TSAN_SUPPORTED_OS osx)
+ set(TYSAN_SUPPORTED_OS osx)
set(XRAY_SUPPORTED_OS osx)
set(FUZZER_SUPPORTED_OS osx)
set(ORC_SUPPORTED_OS)
@@ -593,6 +594,7 @@ if(APPLE)
list(APPEND FUZZER_SUPPORTED_OS ${platform})
list(APPEND ORC_SUPPORTED_OS ${platform})
list(APPEND UBSAN_SUPPORTED_OS ${platform})
+ list(APPEND TYSAN_SUPPORTED_OS ${platform})
list(APPEND LSAN_SUPPORTED_OS ${platform})
list(APPEND STATS_SUPPORTED_OS ${platform})
endif()
@@ -651,6 +653,9 @@ if(APPLE)
list_intersect(CTX_PROFILE_SUPPORTED_ARCH
ALL_CTX_PROFILE_SUPPORTED_ARCH
SANITIZER_COMMON_SUPPORTED_ARCH)
+ list_intersect(TYSAN_SUPPORTED_ARCH
+ ALL_TYSAN_SUPPORTED_ARCH
+ SANITIZER_COMMON_SUPPORTED_ARCH)
list_intersect(TSAN_SUPPORTED_ARCH
ALL_TSAN_SUPPORTED_ARCH
SANITIZER_COMMON_SUPPORTED_ARCH)
@@ -703,6 +708,7 @@ else()
filter_available_targets(PROFILE_SUPPORTED_ARCH ${ALL_PROFILE_SUPPORTED_ARCH})
filter_available_targets(CTX_PROFILE_SUPPORTED_ARCH ${ALL_CTX_PROFILE_SUPPORTED_ARCH})
filter_available_targets(TSAN_SUPPORTED_ARCH ${ALL_TSAN_SUPPORTED_ARCH})
+ filter_available_targets(TYSAN_SUPPORTED_ARCH ${ALL_TYSAN_SUPPORTED_ARCH})
filter_available_targets(UBSAN_SUPPORTED_ARCH ${ALL_UBSAN_SUPPORTED_ARCH})
filter_available_targets(SAFESTACK_SUPPORTED_ARCH
${ALL_SAFESTACK_SUPPORTED_ARCH})
@@ -748,7 +754,7 @@ if(COMPILER_RT_SUPPORTED_ARCH)
endif()
message(STATUS "Compiler-RT supported architectures: ${COMPILER_RT_SUPPORTED_ARCH}")
-set(ALL_SANITIZERS asan;rtsan;dfsan;msan;hwasan;tsan;safestack;cfi;scudo_standalone;ubsan_minimal;gwp_asan;nsan;asan_abi)
+set(ALL_SANITIZERS asan;rtsan;dfsan;msan;hwasan;tsan;tysan;safestack;cfi;scudo_standalone;ubsan_minimal;gwp_asan;nsan;asan_abi)
set(COMPILER_RT_SANITIZERS_TO_BUILD all CACHE STRING
"sanitizers to build if supported on the target (all;${ALL_SANITIZERS})")
list_replace(COMPILER_RT_SANITIZERS_TO_BUILD all "${ALL_SANITIZERS}")
@@ -843,6 +849,13 @@ else()
set(COMPILER_RT_HAS_CTX_PROFILE FALSE)
endif()
+if (COMPILER_RT_HAS_SANITIZER_COMMON AND TYSAN_SUPPORTED_ARCH AND
+ OS_NAME MATCHES "Linux|Darwin")
+ set(COMPILER_RT_HAS_TYSAN TRUE)
+else()
+ set(COMPILER_RT_HAS_TYSAN FALSE)
+endif()
+
if (COMPILER_RT_HAS_SANITIZER_COMMON AND TSAN_SUPPORTED_ARCH)
if (OS_NAME MATCHES "Linux|Darwin|FreeBSD|NetBSD")
set(COMPILER_RT_HAS_TSAN TRUE)
diff --git a/compiler-rt/lib/tysan/CMakeLists.txt b/compiler-rt/lib/tysan/CMakeLists.txt
new file mode 100644
index 00000000000000..859b67928f004a
--- /dev/null
+++ b/compiler-rt/lib/tysan/CMakeLists.txt
@@ -0,0 +1,64 @@
+include_directories(..)
+
+# Runtime library sources and build flags.
+set(TYSAN_SOURCES
+ tysan.cpp
+ tysan_interceptors.cpp)
+set(TYSAN_COMMON_CFLAGS ${SANITIZER_COMMON_CFLAGS})
+append_rtti_flag(OFF TYSAN_COMMON_CFLAGS)
+# Prevent clang from generating libc calls.
+append_list_if(COMPILER_RT_HAS_FFREESTANDING_FLAG -ffreestanding TYSAN_COMMON_CFLAGS)
+
+add_compiler_rt_object_libraries(RTTysan_dynamic
+ OS ${SANITIZER_COMMON_SUPPORTED_OS}
+ ARCHS ${TYSAN_SUPPORTED_ARCH}
+ SOURCES ${TYSAN_SOURCES}
+ ADDITIONAL_HEADERS ${TYSAN_HEADERS}
+ CFLAGS ${TYSAN_DYNAMIC_CFLAGS}
+ DEFS ${TYSAN_DYNAMIC_DEFINITIONS})
+
+
+# Static runtime library.
+add_compiler_rt_component(tysan)
+
+
+if(APPLE)
+ add_weak_symbols("sanitizer_common" WEAK_SYMBOL_LINK_FLAGS)
+
+ add_compiler_rt_runtime(clang_rt.tysan
+ SHARED
+ OS ${SANITIZER_COMMON_SUPPORTED_OS}
+ ARCHS ${TYSAN_SUPPORTED_ARCH}
+ OBJECT_LIBS RTTysan_dynamic
+ RTInterception
+ RTSanitizerCommon
+ RTSanitizerCommonLibc
+ RTSanitizerCommonSymbolizer
+ CFLAGS ${TYSAN_DYNAMIC_CFLAGS}
+ LINK_FLAGS ${WEAK_SYMBOL_LINK_FLAGS}
+ DEFS ${TYSAN_DYNAMIC_DEFINITIONS}
+ PARENT_TARGET tysan)
+
+ add_compiler_rt_runtime(clang_rt.tysan_static
+ STATIC
+ ARCHS ${TYSAN_SUPPORTED_ARCH}
+ OBJECT_LIBS RTTysan_static
+ CFLAGS ${TYSAN_CFLAGS}
+ DEFS ${TYSAN_COMMON_DEFINITIONS}
+ PARENT_TARGET tysan)
+else()
+ foreach(arch ${TYSAN_SUPPORTED_ARCH})
+ set(TYSAN_CFLAGS ${TYSAN_COMMON_CFLAGS})
+ append_list_if(COMPILER_RT_HAS_FPIE_FLAG -fPIE TYSAN_CFLAGS)
+ add_compiler_rt_runtime(clang_rt.tysan
+ STATIC
+ ARCHS ${arch}
+ SOURCES ${TYSAN_SOURCES}
+ $<TARGET_OBJECTS:RTInterception.${arch}>
+ $<TARGET_OBJECTS:RTSanitizerCommon.${arch}>
+ $<TARGET_OBJECTS:RTSanitizerCommonLibc.${arch}>
+ $<TARGET_OBJECTS:RTSanitizerCommonSymbolizer.${arch}>
+ CFLAGS ${TYSAN_CFLAGS}
+ PARENT_TARGET tysan)
+ endforeach()
+endif()
diff --git a/compiler-rt/lib/tysan/lit.cfg b/compiler-rt/lib/tysan/lit.cfg
new file mode 100644
index 00000000000000..bd2bbe855529a7
--- /dev/null
+++ b/compiler-rt/lib/tysan/lit.cfg
@@ -0,0 +1,35 @@
+# -*- Python -*-
+
+import os
+
+# Setup config name.
+config.name = 'TypeSanitizer' + getattr(config, 'name_suffix', 'default')
+
+# Setup source root.
+config.test_source_root = os.path.dirname(__file__)
+
+# Setup default compiler flags used with -fsanitize=type option.
+clang_tysan_cflags = (["-fsanitize=type",
+ "-mno-omit-leaf-frame-pointer",
+ "-fno-omit-frame-pointer",
+ "-fno-optimize-sibling-calls"] +
+ [config.target_cflags] +
+ config.debug_info_flags)
+clang_tysan_cxxflags = config.cxx_mode_flags + clang_tysan_cflags
+
+def build_invocation(compile_flags):
+ return " " + " ".join([config.clang] + compile_flags) + " "
+
+config.substitutions.append( ("%clang_tysan ", build_invocation(clang_tysan_cflags)) )
+config.substitutions.append( ("%clangxx_tysan ", build_invocation(clang_tysan_cxxflags)) )
+
+# Default test suffixes.
+config.suffixes = ['.c', '.cc', '.cpp']
+
+# TypeSanitizer tests are currently supported on Linux only.
+if config.host_os not in ['Linux']:
+ config.unsupported = True
+
+if config.target_arch != 'aarch64':
+ config.available_features.add('stable-runtime')
+
diff --git a/compiler-rt/lib/tysan/lit.site.cfg.in b/compiler-rt/lib/tysan/lit.site.cfg.in
new file mode 100644
index 00000000000000..673d04e514379b
--- /dev/null
+++ b/compiler-rt/lib/tysan/lit.site.cfg.in
@@ -0,0 +1,12 @@
+ at LIT_SITE_CFG_IN_HEADER@
+
+# Tool-specific config options.
+config.name_suffix = "@TYSAN_TEST_CONFIG_SUFFIX@"
+config.target_cflags = "@TYSAN_TEST_TARGET_CFLAGS@"
+config.target_arch = "@TYSAN_TEST_TARGET_ARCH@"
+
+# Load common config for all compiler-rt lit tests.
+lit_config.load_config(config, "@COMPILER_RT_BINARY_DIR@/test/lit.common.configured")
+
+# Load tool-specific config that would do the real work.
+lit_config.load_config(config, "@TYSAN_LIT_SOURCE_DIR@/lit.cfg")
diff --git a/compiler-rt/lib/tysan/tysan.cpp b/compiler-rt/lib/tysan/tysan.cpp
new file mode 100644
index 00000000000000..f1b6bdcf0d8261
--- /dev/null
+++ b/compiler-rt/lib/tysan/tysan.cpp
@@ -0,0 +1,344 @@
+//===-- tysan.cpp ---------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of TypeSanitizer.
+//
+// TypeSanitizer runtime.
+//===----------------------------------------------------------------------===//
+
+#include "sanitizer_common/sanitizer_atomic.h"
+#include "sanitizer_common/sanitizer_common.h"
+#include "sanitizer_common/sanitizer_flag_parser.h"
+#include "sanitizer_common/sanitizer_flags.h"
+#include "sanitizer_common/sanitizer_libc.h"
+#include "sanitizer_common/sanitizer_report_decorator.h"
+#include "sanitizer_common/sanitizer_stacktrace.h"
+#include "sanitizer_common/sanitizer_symbolizer.h"
+
+#include "tysan/tysan.h"
+
+using namespace __sanitizer;
+using namespace __tysan;
+
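+// Every byte of application memory is shadowed by a pointer-sized slot that
+// holds a type descriptor pointer (or zero for "unknown"); these two exported
+// entry points clear and propagate that metadata and are also used by the
+// interceptors below.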
+extern "C" SANITIZER_INTERFACE_ATTRIBUTE void
+tysan_set_type_unknown(const void *addr, uptr size) {
+ if (tysan_inited)
+ internal_memset(shadow_for(addr), 0, size * sizeof(uptr));
+}
+
+extern "C" SANITIZER_INTERFACE_ATTRIBUTE void
+tysan_copy_types(const void *daddr, const void *saddr, uptr size) {
+ if (tysan_inited)
+ internal_memmove(shadow_for(daddr), shadow_for(saddr), size * sizeof(uptr));
+}
+
+static const char *getDisplayName(const char *Name) {
+ if (Name[0] == '\0')
+ return "<anonymous type>";
+
+ // Clang generates tags for C++ types that demangle to "typeinfo name for
+ // <type>"; strip that prefix from the demangled string.
+ const char TIPrefix[] = "typeinfo name for ";
+
+ const char *DName = Symbolizer::GetOrInit()->Demangle(Name);
+ if (!internal_strncmp(DName, TIPrefix, sizeof(TIPrefix) - 1))
+ DName += sizeof(TIPrefix) - 1;
+
+ return DName;
+}
+
+static void printTDName(tysan_type_descriptor *td) {
+ if (((sptr)td) <= 0) {
+ Printf("<unknown type>");
+ return;
+ }
+
+ switch (td->Tag) {
+ default:
+ DCHECK(0);
+ break;
+ case TYSAN_MEMBER_TD:
+ printTDName(td->Member.Access);
+ if (td->Member.Access != td->Member.Base) {
+ Printf(" (in ");
+ printTDName(td->Member.Base);
+ Printf(" at offset %zu)", td->Member.Offset);
+ }
+ break;
+ case TYSAN_STRUCT_TD:
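+ // The struct name is tail-allocated immediately after the member array.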
+ Printf("%s", getDisplayName(
+ (char *)(td->Struct.Members + td->Struct.MemberCount)));
+ break;
+ }
+}
+
+static tysan_type_descriptor *getRootTD(tysan_type_descriptor *TD) {
+ tysan_type_descriptor *RootTD = TD;
+
+ do {
+ RootTD = TD;
+
+ if (TD->Tag == TYSAN_STRUCT_TD) {
+ if (TD->Struct.MemberCount > 0)
+ TD = TD->Struct.Members[0].Type;
+ else
+ TD = nullptr;
+ } else if (TD->Tag == TYSAN_MEMBER_TD) {
+ TD = TD->Member.Access;
+ } else {
+ DCHECK(0);
+ break;
+ }
+ } while (TD);
+
+ return RootTD;
+}
+
+static bool isAliasingLegalUp(tysan_type_descriptor *TDA,
+ tysan_type_descriptor *TDB, int TDAOffset) {
+ // Walk up the tree starting with TDA to see if we reach TDB.
+ uptr OffsetA = 0, OffsetB = 0;
+ if (TDB->Tag == TYSAN_MEMBER_TD) {
+ OffsetB = TDB->Member.Offset;
+ TDB = TDB->Member.Base;
+ }
+
+ if (TDA->Tag == TYSAN_MEMBER_TD) {
+ OffsetA = TDA->Member.Offset - TDAOffset;
+ TDA = TDA->Member.Base;
+ }
+
+ do {
+ if (TDA == TDB) {
+ return OffsetA == OffsetB;
+ }
+
+ if (TDA->Tag == TYSAN_STRUCT_TD) {
+ // Reached root type descriptor.
+ if (!TDA->Struct.MemberCount)
+ break;
+
+ uptr Idx = 0;
+ for (; Idx < TDA->Struct.MemberCount - 1; ++Idx) {
+ if (TDA->Struct.Members[Idx].Offset >= OffsetA)
+ break;
+ }
+
+ OffsetA -= TDA->Struct.Members[Idx].Offset;
+ TDA = TDA->Struct.Members[Idx].Type;
+ } else {
+ DCHECK(0);
+ break;
+ }
+ } while (TDA);
+
+ return false;
+}
+
+static bool isAliasingLegal(tysan_type_descriptor *TDA,
+ tysan_type_descriptor *TDB, int TDAOffset = 0) {
+ if (TDA == TDB || !TDB || !TDA)
+ return true;
+
+ // Aliasing is legal if the two types have different root nodes.
+ if (getRootTD(TDA) != getRootTD(TDB))
+ return true;
+
+ // TDB may have been adjusted by offset TDAOffset in the caller to point to
+ // the outer type. Check for aliasing with and without adjusting for this
+ // offset.
+ return isAliasingLegalUp(TDA, TDB, 0) || isAliasingLegalUp(TDB, TDA, 0) ||
+ isAliasingLegalUp(TDA, TDB, TDAOffset);
+}
+
+namespace __tysan {
+class Decorator : public __sanitizer::SanitizerCommonDecorator {
+public:
+ Decorator() : SanitizerCommonDecorator() {}
+ const char *Warning() { return Red(); }
+ const char *Name() { return Green(); }
+ const char *End() { return Default(); }
+};
+} // namespace __tysan
+
+ALWAYS_INLINE
+static void reportError(void *Addr, int Size, tysan_type_descriptor *TD,
+ tysan_type_descriptor *OldTD, const char *AccessStr,
+ const char *DescStr, int Offset, uptr pc, uptr bp,
+ uptr sp) {
+ Decorator d;
+ Printf("%s", d.Warning());
+ Report("ERROR: TypeSanitizer: type-aliasing-violation on address %p"
+ " (pc %p bp %p sp %p tid %llu)\n",
+ Addr, (void *)pc, (void *)bp, (void *)sp, GetTid());
+ Printf("%s", d.End());
+ Printf("%s of size %d at %p with type ", AccessStr, Size, Addr);
+
+ Printf("%s", d.Name());
+ printTDName(TD);
+ Printf("%s", d.End());
+
+ Printf(" %s of type ", DescStr);
+
+ Printf("%s", d.Name());
+ printTDName(OldTD);
+ Printf("%s", d.End());
+
+ if (Offset != 0)
+ Printf(" that starts at offset %d\n", Offset);
+ else
+ Printf("\n");
+
+ if (pc) {
+ bool request_fast = StackTrace::WillUseFastUnwind(true);
+ BufferedStackTrace ST;
+ ST.Unwind(kStackTraceMax, pc, bp, 0, 0, 0, request_fast);
+ ST.Print();
+ } else {
+ Printf("\n");
+ }
+}
+
+extern "C" SANITIZER_INTERFACE_ATTRIBUTE void
+__tysan_check(void *addr, int size, tysan_type_descriptor *td, int flags) {
+ GET_CALLER_PC_BP_SP;
+
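+ // Bit 0 of flags marks a read and bit 1 a write; any other combination is
+ // reported as an atomic update.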
+ bool IsRead = flags & 1;
+ bool IsWrite = flags & 2;
+ const char *AccessStr;
+ if (IsRead && !IsWrite)
+ AccessStr = "READ";
+ else if (!IsRead && IsWrite)
+ AccessStr = "WRITE";
+ else
+ AccessStr = "ATOMIC UPDATE";
+
+ tysan_type_descriptor **OldTDPtr = shadow_for(addr);
+ tysan_type_descriptor *OldTD = *OldTDPtr;
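+ // A negative shadow value marks an interior byte: its magnitude is the
+ // distance back to the shadow slot holding the descriptor of the enclosing
+ // object.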
+ if (((sptr)OldTD) < 0) {
+ int i = -((sptr)OldTD);
+ OldTDPtr -= i;
+ OldTD = *OldTDPtr;
+
+ if (!isAliasingLegal(td, OldTD, i))
+ reportError(addr, size, td, OldTD, AccessStr,
+ "accesses part of an existing object", -i, pc, bp, sp);
+
+ return;
+ }
+
+ if (!isAliasingLegal(td, OldTD)) {
+ reportError(addr, size, td, OldTD, AccessStr, "accesses an existing object",
+ 0, pc, bp, sp);
+ return;
+ }
+
+ // The type at the start of the access is allowed to alias (or the stored
+ // type is unknown); report an error if some other, incompatible object
+ // starts within the accessed region.
+
+ for (int i = 0; i < size; ++i) {
+ OldTDPtr = shadow_for((void *)(((uptr)addr) + i));
+ OldTD = *OldTDPtr;
+ if (((sptr)OldTD) >= 0 && !isAliasingLegal(td, OldTD))
+ reportError(addr, size, td, OldTD, AccessStr,
+ "partially accesses an object", i, pc, bp, sp);
+ }
+}
+
+Flags __tysan::flags_data;
+
+SANITIZER_INTERFACE_ATTRIBUTE uptr __tysan_shadow_memory_address;
+SANITIZER_INTERFACE_ATTRIBUTE uptr __tysan_app_memory_mask;
+
+#ifdef TYSAN_RUNTIME_VMA
+// Runtime detected VMA size.
+int __tysan::vmaSize;
+#endif
+
+void Flags::SetDefaults() {
+#define TYSAN_FLAG(Type, Name, DefaultValue, Description) Name = DefaultValue;
+#include "tysan_flags.inc"
+#undef TYSAN_FLAG
+}
+
+static void RegisterTySanFlags(FlagParser *parser, Flags *f) {
+#define TYSAN_FLAG(Type, Name, DefaultValue, Description) \
+ RegisterFlag(parser, #Name, Description, &f->Name);
+#include "tysan_flags.inc"
+#undef TYSAN_FLAG
+}
+
+static void InitializeFlags() {
+ SetCommonFlagsDefaults();
+ {
+ CommonFlags cf;
+ cf.CopyFrom(*common_flags());
+ cf.external_symbolizer_path = GetEnv("TYSAN_SYMBOLIZER_PATH");
+ OverrideCommonFlags(cf);
+ }
+
+ flags().SetDefaults();
+
+ FlagParser parser;
+ RegisterCommonFlags(&parser);
+ RegisterTySanFlags(&parser, &flags());
+ parser.ParseString(GetEnv("TYSAN_OPTIONS"));
+ InitializeCommonFlags();
+ if (Verbosity())
+ ReportUnrecognizedFlags();
+ if (common_flags()->help)
+ parser.PrintFlagDescriptions();
+}
+
+static void TySanInitializePlatformEarly() {
+ AvoidCVE_2016_2143();
+#ifdef TYSAN_RUNTIME_VMA
+ vmaSize = (MostSignificantSetBitIndex(GET_CURRENT_FRAME()) + 1);
+#if defined(__aarch64__) && !SANITIZER_APPLE
+ if (vmaSize != 39 && vmaSize != 42 && vmaSize != 48) {
+ Printf("FATAL: TypeSanitizer: unsupported VMA range\n");
+ Printf("FATAL: Found %d - Supported 39, 42 and 48\n", vmaSize);
+ Die();
+ }
+#endif
+#endif
+
+ __sanitizer::InitializePlatformEarly();
+
+ __tysan_shadow_memory_address = ShadowAddr();
+ __tysan_app_memory_mask = AppMask();
+}
+
+namespace __tysan {
+bool tysan_inited = false;
+bool tysan_init_is_running;
+} // namespace __tysan
+
+extern "C" SANITIZER_INTERFACE_ATTRIBUTE void __tysan_init() {
+ CHECK(!tysan_init_is_running);
+ if (tysan_inited)
+ return;
+ tysan_init_is_running = true;
+
+ InitializeFlags();
+ TySanInitializePlatformEarly();
+
+ InitializeInterceptors();
+
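+ // Reserve the whole shadow region, i.e. everything between the shadow base
+ // and the start of application memory.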
+ if (!MmapFixedNoReserve(ShadowAddr(), AppAddr() - ShadowAddr()))
+ Die();
+
+ tysan_init_is_running = false;
+ tysan_inited = true;
+}
+
+#if SANITIZER_CAN_USE_PREINIT_ARRAY
+__attribute__((section(".preinit_array"),
+ used)) static void (*tysan_init_ptr)() = __tysan_init;
+#endif
diff --git a/compiler-rt/lib/tysan/tysan.h b/compiler-rt/lib/tysan/tysan.h
new file mode 100644
index 00000000000000..ec6f9587e9ce58
--- /dev/null
+++ b/compiler-rt/lib/tysan/tysan.h
@@ -0,0 +1,79 @@
+//===-- tysan.h -------------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of TypeSanitizer.
+//
+// Private TySan header.
+//===----------------------------------------------------------------------===//
+
+#ifndef TYSAN_H
+#define TYSAN_H
+
+#include "sanitizer_common/sanitizer_internal_defs.h"
+
+using __sanitizer::sptr;
+using __sanitizer::u16;
+using __sanitizer::uptr;
+
+#include "tysan_platform.h"
+
+extern "C" {
+void tysan_set_type_unknown(const void *addr, uptr size);
+void tysan_copy_types(const void *daddr, const void *saddr, uptr size);
+}
+
+namespace __tysan {
+extern bool tysan_inited;
+extern bool tysan_init_is_running;
+
+void InitializeInterceptors();
+
+enum { TYSAN_MEMBER_TD = 1, TYSAN_STRUCT_TD = 2 };
+
+struct tysan_member_type_descriptor {
+ struct tysan_type_descriptor *Base;
+ struct tysan_type_descriptor *Access;
+ uptr Offset;
+};
+
+struct tysan_struct_type_descriptor {
+ uptr MemberCount;
+ struct {
+ struct tysan_type_descriptor *Type;
+ uptr Offset;
+ } Members[1]; // Tail allocated.
+ // char Name[]; // Tail allocated.
+};
+
+struct tysan_type_descriptor {
+ uptr Tag;
+ union {
+ tysan_member_type_descriptor Member;
+ tysan_struct_type_descriptor Struct;
+ };
+};
+
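+// Map an application address to its shadow slot. Each application byte has a
+// pointer-sized slot holding either a type descriptor, zero for "unknown", or
+// a negative offset back to the descriptor of the enclosing object.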
+inline tysan_type_descriptor **shadow_for(const void *ptr) {
+ return (tysan_type_descriptor **)((((uptr)ptr) & AppMask()) * sizeof(ptr) +
+ ShadowAddr());
+}
+
+struct Flags {
+#define TYSAN_FLAG(Type, Name, DefaultValue, Description) Type Name;
+#include "tysan_flags.inc"
+#undef TYSAN_FLAG
+
+ void SetDefaults();
+};
+
+extern Flags flags_data;
+inline Flags &flags() { return flags_data; }
+
+} // namespace __tysan
+
+#endif // TYSAN_H
diff --git a/compiler-rt/lib/tysan/tysan.syms.extra b/compiler-rt/lib/tysan/tysan.syms.extra
new file mode 100644
index 00000000000000..04e78543161998
--- /dev/null
+++ b/compiler-rt/lib/tysan/tysan.syms.extra
@@ -0,0 +1,2 @@
+tysan_*
+__tysan_*
diff --git a/compiler-rt/lib/tysan/tysan_flags.inc b/compiler-rt/lib/tysan/tysan_flags.inc
new file mode 100644
index 00000000000000..98b6591f844ef0
--- /dev/null
+++ b/compiler-rt/lib/tysan/tysan_flags.inc
@@ -0,0 +1,17 @@
+//===-- tysan_flags.inc ---------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// TySan runtime flags.
+//
+//===----------------------------------------------------------------------===//
+#ifndef TYSAN_FLAG
+#error "Define TYSAN_FLAG prior to including this file!"
+#endif
+
+// TYSAN_FLAG(Type, Name, DefaultValue, Description)
+// See COMMON_FLAG in sanitizer_flags.inc for more details.
diff --git a/compiler-rt/lib/tysan/tysan_interceptors.cpp b/compiler-rt/lib/tysan/tysan_interceptors.cpp
new file mode 100644
index 00000000000000..5fc6f244122727
--- /dev/null
+++ b/compiler-rt/lib/tysan/tysan_interceptors.cpp
@@ -0,0 +1,250 @@
+//===-- tysan_interceptors.cpp --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of TypeSanitizer.
+//
+// Interceptors for standard library functions.
+//===----------------------------------------------------------------------===//
+
+#include "interception/interception.h"
+#include "sanitizer_common/sanitizer_common.h"
+#include "tysan/tysan.h"
+
+#if SANITIZER_LINUX && !SANITIZER_ANDROID
+#define TYSAN_INTERCEPT___STRDUP 1
+#else
+#define TYSAN_INTERCEPT___STRDUP 0
+#endif
+
+#if SANITIZER_LINUX
+extern "C" int mallopt(int param, int value);
+#endif
+
+using namespace __sanitizer;
+using namespace __tysan;
+
+static const uptr early_alloc_buf_size = 16384;
+static uptr allocated_bytes;
+static char early_alloc_buf[early_alloc_buf_size];
+
+static bool isInEarlyAllocBuf(const void *ptr) {
+ return ((uptr)ptr >= (uptr)early_alloc_buf &&
+ ((uptr)ptr - (uptr)early_alloc_buf) < sizeof(early_alloc_buf));
+}
+
+// Handle allocation requests early (before all interceptors are set up). dlsym,
+// for example, calls calloc.
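+// This is a simple bump allocator; the memory is never reused, and free() of
+// these pointers is skipped in the interceptor below.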
+static void *handleEarlyAlloc(uptr size) {
+ void *mem = (void *)&early_alloc_buf[allocated_bytes];
+ allocated_bytes += size;
+ CHECK_LT(allocated_bytes, early_alloc_buf_size);
+ return mem;
+}
+
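+// The mem* interceptors keep the shadow in sync with the data: memset clears
+// the destination's type information, while memmove and memcpy propagate it
+// from the source to the destination.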
+INTERCEPTOR(void *, memset, void *dst, int v, uptr size) {
+ if (!tysan_inited && REAL(memset) == nullptr)
+ return internal_memset(dst, v, size);
+
+ void *res = REAL(memset)(dst, v, size);
+ tysan_set_type_unknown(dst, size);
+ return res;
+}
+
+INTERCEPTOR(void *, memmove, void *dst, const void *src, uptr size) {
+ if (!tysan_inited && REAL(memmove) == nullptr)
+ return internal_memmove(dst, src, size);
+
+ void *res = REAL(memmove)(dst, src, size);
+ tysan_copy_types(dst, src, size);
+ return res;
+}
+
+INTERCEPTOR(void *, memcpy, void *dst, const void *src, uptr size) {
+ if (!tysan_inited && REAL(memcpy) == nullptr) {
+ // internal_memmove is used here because, on some platforms, this
+ // interceptor may also end up handling calls to memmove.
+ return internal_memmove(dst, src, size);
+ }
+
+ void *res = REAL(memcpy)(dst, src, size);
+ tysan_copy_types(dst, src, size);
+ return res;
+}
+
+INTERCEPTOR(void *, mmap, void *addr, SIZE_T length, int prot, int flags,
+ int fd, OFF_T offset) {
+ void *res = REAL(mmap)(addr, length, prot, flags, fd, offset);
+ if (res != (void *)-1)
+ tysan_set_type_unknown(res, RoundUpTo(length, GetPageSize()));
+ return res;
+}
+
+#if !SANITIZER_APPLE
+INTERCEPTOR(void *, mmap64, void *addr, SIZE_T length, int prot, int flags,
+ int fd, OFF64_T offset) {
+ void *res = REAL(mmap64)(addr, length, prot, flags, fd, offset);
+ if (res != (void *)-1)
+ tysan_set_type_unknown(res, RoundUpTo(length, GetPageSize()));
+ return res;
+}
+#endif
+
+INTERCEPTOR(char *, strdup, const char *s) {
+ char *res = REAL(strdup)(s);
+ if (res)
+ tysan_copy_types(res, const_cast<char *>(s), internal_strlen(s));
+ return res;
+}
+
+#if TYSAN_INTERCEPT___STRDUP
+INTERCEPTOR(char *, __strdup, const char *s) {
+ char *res = REAL(__strdup)(s);
+ if (res)
+ tysan_copy_types(res, const_cast<char *>(s), internal_strlen(s));
+ return res;
+}
+#endif // TYSAN_INTERCEPT___STRDUP
+
+INTERCEPTOR(void *, malloc, uptr size) {
+ if (tysan_init_is_running && REAL(malloc) == nullptr)
+ return handleEarlyAlloc(size);
+
+ void *res = REAL(malloc)(size);
+ if (res)
+ tysan_set_type_unknown(res, size);
+ return res;
+}
+
+INTERCEPTOR(void *, realloc, void *ptr, uptr size) {
+ void *res = REAL(realloc)(ptr, size);
+ // We might want to copy the types from the original allocation (although
+ // that would require knowing its size).
+ if (res)
+ tysan_set_type_unknown(res, size);
+ return res;
+}
+
+INTERCEPTOR(void *, calloc, uptr nmemb, uptr size) {
+ if (tysan_init_is_running && REAL(calloc) == nullptr)
+ return handleEarlyAlloc(nmemb * size);
+
+ void *res = REAL(calloc)(nmemb, size);
+ if (res)
+ tysan_set_type_unknown(res, nmemb * size);
+ return res;
+}
+
+INTERCEPTOR(void, free, void *p) {
+ // There are only a few early allocation requests,
+ // so we simply skip the free.
+ if (isInEarlyAllocBuf(p))
+ return;
+ REAL(free)(p);
+}
+
+INTERCEPTOR(void *, valloc, uptr size) {
+ void *res = REAL(valloc)(size);
+ if (res)
+ tysan_set_type_unknown(res, size);
+ return res;
+}
+
+#if SANITIZER_INTERCEPT_MEMALIGN
+INTERCEPTOR(void *, memalign, uptr alignment, uptr size) {
+ void *res = REAL(memalign)(alignment, size);
+ if (res)
+ tysan_set_type_unknown(res, size);
+ return res;
+}
+#define TYSAN_MAYBE_INTERCEPT_MEMALIGN INTERCEPT_FUNCTION(memalign)
+#else
+#define TYSAN_MAYBE_INTERCEPT_MEMALIGN
+#endif // SANITIZER_INTERCEPT_MEMALIGN
+
+#if SANITIZER_INTERCEPT___LIBC_MEMALIGN
+INTERCEPTOR(void *, __libc_memalign, uptr alignment, uptr size) {
+ void *res = REAL(__libc_memalign)(alignment, size);
+ if (res)
+ tysan_set_type_unknown(res, size);
+ return res;
+}
+#define TYSAN_MAYBE_INTERCEPT___LIBC_MEMALIGN \
+ INTERCEPT_FUNCTION(__libc_memalign)
+#else
+#define TYSAN_MAYBE_INTERCEPT___LIBC_MEMALIGN
+#endif // SANITIZER_INTERCEPT___LIBC_MEMALIGN
+
+#if SANITIZER_INTERCEPT_PVALLOC
+INTERCEPTOR(void *, pvalloc, uptr size) {
+ void *res = REAL(pvalloc)(size);
+ if (res)
+ tysan_set_type_unknown(res, size);
+ return res;
+}
+#define TYSAN_MAYBE_INTERCEPT_PVALLOC INTERCEPT_FUNCTION(pvalloc)
+#else
+#define TYSAN_MAYBE_INTERCEPT_PVALLOC
+#endif // SANITIZER_INTERCEPT_PVALLOC
+
+#if SANITIZER_INTERCEPT_ALIGNED_ALLOC
+INTERCEPTOR(void *, aligned_alloc, uptr alignment, uptr size) {
+ void *res = REAL(aligned_alloc)(alignment, size);
+ if (res)
+ tysan_set_type_unknown(res, size);
+ return res;
+}
+#define TYSAN_MAYBE_INTERCEPT_ALIGNED_ALLOC INTERCEPT_FUNCTION(aligned_alloc)
+#else
+#define TYSAN_MAYBE_INTERCEPT_ALIGNED_ALLOC
+#endif
+
+INTERCEPTOR(int, posix_memalign, void **memptr, uptr alignment, uptr size) {
+ int res = REAL(posix_memalign)(memptr, alignment, size);
+ if (res == 0 && *memptr)
+ tysan_set_type_unknown(*memptr, size);
+ return res;
+}
+
+namespace __tysan {
+void InitializeInterceptors() {
+ static int inited = 0;
+ CHECK_EQ(inited, 0);
+
+ // Instruct libc malloc to consume less memory.
+#if SANITIZER_LINUX
+ mallopt(1, 0); // M_MXFAST
+ mallopt(-3, 32 * 1024); // M_MMAP_THRESHOLD
+#endif
+
+ INTERCEPT_FUNCTION(mmap);
+
+ INTERCEPT_FUNCTION(mmap64);
+
+ INTERCEPT_FUNCTION(strdup);
+#if TYSAN_INTERCEPT___STRDUP
+ INTERCEPT_FUNCTION(__strdup);
+#endif
+
+ INTERCEPT_FUNCTION(malloc);
+ INTERCEPT_FUNCTION(calloc);
+ INTERCEPT_FUNCTION(free);
+ INTERCEPT_FUNCTION(realloc);
+ INTERCEPT_FUNCTION(valloc);
+ TYSAN_MAYBE_INTERCEPT_MEMALIGN;
+ TYSAN_MAYBE_INTERCEPT___LIBC_MEMALIGN;
+ TYSAN_MAYBE_INTERCEPT_PVALLOC;
+ TYSAN_MAYBE_INTERCEPT_ALIGNED_ALLOC
+ INTERCEPT_FUNCTION(posix_memalign);
+
+ INTERCEPT_FUNCTION(memset);
+ INTERCEPT_FUNCTION(memmove);
+ INTERCEPT_FUNCTION(memcpy);
+
+ inited = 1;
+}
+} // namespace __tysan
diff --git a/compiler-rt/lib/tysan/tysan_platform.h b/compiler-rt/lib/tysan/tysan_platform.h
new file mode 100644
index 00000000000000..f01392885d9398
--- /dev/null
+++ b/compiler-rt/lib/tysan/tysan_platform.h
@@ -0,0 +1,93 @@
+//===------------------------ tysan_platform.h ----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of TypeSanitizer.
+//
+// Platform specific information for TySan.
+//===----------------------------------------------------------------------===//
+
+#ifndef TYSAN_PLATFORM_H
+#define TYSAN_PLATFORM_H
+
+namespace __tysan {
+
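+// The shadow mapping used by shadow_for() (see tysan.h) is
+//   shadow = ((addr & kAppMemMsk) * sizeof(void *)) + kShadowAddr,
+// i.e. one pointer-sized shadow slot per application byte.
+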
+#if defined(__x86_64__) || SANITIZER_APPLE
+struct Mapping {
+ static const uptr kShadowAddr = 0x010000000000ull;
+ static const uptr kAppAddr = 0x550000000000ull;
+ static const uptr kAppMemMsk = ~0x780000000000ull;
+};
+#elif defined(__aarch64__)
+struct Mapping39 {
+ static const uptr kShadowAddr = 0x0800000000ull;
+ static const uptr kAppAddr = 0x5500000000ull;
+ static const uptr kAppMemMsk = ~0x7800000000ull;
+};
+
+struct Mapping42 {
+ static const uptr kShadowAddr = 0x10000000000ull;
+ static const uptr kAppAddr = 0x2aa00000000ull;
+ static const uptr kAppMemMsk = ~0x3c000000000ull;
+};
+
+struct Mapping48 {
+ static const uptr kShadowAddr = 0x0002000000000ull;
+ static const uptr kAppAddr = 0x0aaaa00000000ull;
+ static const uptr kAppMemMsk = ~0x0fff800000000ull;
+};
+#define TYSAN_RUNTIME_VMA 1
+#else
+#error "TySan not supported for this platform!"
+#endif
+
+#if TYSAN_RUNTIME_VMA
+extern int vmaSize;
+#endif
+
+enum MappingType { MAPPING_SHADOW_ADDR, MAPPING_APP_ADDR, MAPPING_APP_MASK };
+
+template <typename Mapping, int Type> uptr MappingImpl(void) {
+ switch (Type) {
+ case MAPPING_SHADOW_ADDR:
+ return Mapping::kShadowAddr;
+ case MAPPING_APP_ADDR:
+ return Mapping::kAppAddr;
+ case MAPPING_APP_MASK:
+ return Mapping::kAppMemMsk;
+ }
+}
+
+template <int Type> uptr MappingArchImpl(void) {
+#if defined(__aarch64__) && !SANITIZER_APPLE
+ switch (vmaSize) {
+ case 39:
+ return MappingImpl<Mapping39, Type>();
+ case 42:
+ return MappingImpl<Mapping42, Type>();
+ case 48:
+ return MappingImpl<Mapping48, Type>();
+ }
+ DCHECK(0);
+ return 0;
+#else
+ return MappingImpl<Mapping, Type>();
+#endif
+}
+
+ALWAYS_INLINE
+uptr ShadowAddr() { return MappingArchImpl<MAPPING_SHADOW_ADDR>(); }
+
+ALWAYS_INLINE
+uptr AppAddr() { return MappingArchImpl<MAPPING_APP_ADDR>(); }
+
+ALWAYS_INLINE
+uptr AppMask() { return MappingArchImpl<MAPPING_APP_MASK>(); }
+
+} // namespace __tysan
+
+#endif
diff --git a/compiler-rt/test/tysan/CMakeLists.txt b/compiler-rt/test/tysan/CMakeLists.txt
new file mode 100644
index 00000000000000..76f57501e854e6
--- /dev/null
+++ b/compiler-rt/test/tysan/CMakeLists.txt
@@ -0,0 +1,32 @@
+set(TYSAN_LIT_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+
+set(TYSAN_TESTSUITES)
+
+set(TYSAN_TEST_ARCH ${TYSAN_SUPPORTED_ARCH})
+if(APPLE)
+ darwin_filter_host_archs(TYSAN_SUPPORTED_ARCH TYSAN_TEST_ARCH)
+endif()
+
+foreach(arch ${TYSAN_TEST_ARCH})
+ set(TYSAN_TEST_TARGET_ARCH ${arch})
+ string(TOLOWER "-${arch}" TYSAN_TEST_CONFIG_SUFFIX)
+ get_test_cc_for_arch(${arch} TYSAN_TEST_TARGET_CC TYSAN_TEST_TARGET_CFLAGS)
+ string(TOUPPER ${arch} ARCH_UPPER_CASE)
+ set(CONFIG_NAME ${ARCH_UPPER_CASE}Config)
+
+ configure_lit_site_cfg(
+ ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in
+ ${CMAKE_CURRENT_BINARY_DIR}/${CONFIG_NAME}/lit.site.cfg.py)
+ list(APPEND TYSAN_TESTSUITES ${CMAKE_CURRENT_BINARY_DIR}/${CONFIG_NAME})
+endforeach()
+
+set(TYSAN_TEST_DEPS ${SANITIZER_COMMON_LIT_TEST_DEPS})
+if(NOT COMPILER_RT_STANDALONE_BUILD)
+ list(APPEND TYSAN_TEST_DEPS tysan)
+endif()
+
+add_lit_testsuite(check-tysan "Running the TypeSanitizer tests"
+ ${TYSAN_TESTSUITES}
+ DEPENDS ${TYSAN_TEST_DEPS}
+ )
+set_target_properties(check-tysan PROPERTIES FOLDER "Compiler-RT Misc")
diff --git a/compiler-rt/test/tysan/anon-ns.cpp b/compiler-rt/test/tysan/anon-ns.cpp
new file mode 100644
index 00000000000000..681304411df315
--- /dev/null
+++ b/compiler-rt/test/tysan/anon-ns.cpp
@@ -0,0 +1,41 @@
+// RUN: %clangxx_tysan -O0 %s -c -o %t.o
+// RUN: %clangxx_tysan -O0 %s -DPMAIN -c -o %tm.o
+// RUN: %clangxx_tysan -O0 %t.o %tm.o -o %t
+// RUN: %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+#include <iostream>
+
+// This test demonstrates that the types from anonymous namespaces are
+// different in different translation units (while the char* type is the same).
+
+namespace {
+struct X {
+ X(int i, int j) : a(i), b(j) {}
+ int a;
+ int b;
+};
+} // namespace
+
+#ifdef PMAIN
+void foo(void *context, int i);
+char fbyte(void *context);
+
+int main() {
+ X x(5, 6);
+ foo((void *)&x, 8);
+ std::cout << "fbyte: " << fbyte((void *)&x) << "\n";
+}
+#else
+void foo(void *context, int i) {
+ X *x = (X *)context;
+ x->b = i;
+ // CHECK: ERROR: TypeSanitizer: type-aliasing-violation
+ // CHECK: WRITE of size 4 at {{.*}} with type int (in (anonymous namespace)::X at offset 4) accesses an existing object of type int (in (anonymous namespace)::X at offset 4)
+ // CHECK: {{#0 0x.* in foo\(void\*, int\) .*anon-ns.cpp:}}[[@LINE-3]]
+}
+
+char fbyte(void *context) { return *(char *)context; }
+#endif
+
+// CHECK-NOT: ERROR: TypeSanitizer: type-aliasing-violation
diff --git a/compiler-rt/test/tysan/anon-same-struct.c b/compiler-rt/test/tysan/anon-same-struct.c
new file mode 100644
index 00000000000000..b9044f2a0a73c8
--- /dev/null
+++ b/compiler-rt/test/tysan/anon-same-struct.c
@@ -0,0 +1,26 @@
+// RUN: %clang_tysan -O0 %s -o %t && %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+#include <stdio.h>
+
+// The two anonymous structs are structurally identical. As a result, we don't
+// report an aliasing violation here.
+// CHECK-NOT: ERROR: TypeSanitizer: type-aliasing-violation
+
+typedef struct {
+ int i1;
+} s1;
+typedef struct {
+ int i2;
+} s2;
+
+void f(s1 *s1p, s2 *s2p) {
+ s1p->i1 = 2;
+ s2p->i2 = 3;
+ printf("%i\n", s1p->i1);
+}
+
+int main() {
+ s1 s = {.i1 = 1};
+ f(&s, (s2 *)&s);
+}
diff --git a/compiler-rt/test/tysan/anon-struct.c b/compiler-rt/test/tysan/anon-struct.c
new file mode 100644
index 00000000000000..25f6633545928c
--- /dev/null
+++ b/compiler-rt/test/tysan/anon-struct.c
@@ -0,0 +1,27 @@
+// RUN: %clang_tysan -O0 %s -o %t && %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+#include <stdio.h>
+
+typedef struct {
+ int i1, i1b;
+} s1;
+typedef struct {
+ int i2, i2b, i2c;
+} s2;
+
+void f(s1 *s1p, s2 *s2p) {
+ s1p->i1 = 2;
+ s2p->i2 = 3;
+ // CHECK: ERROR: TypeSanitizer: type-aliasing-violation
+ // CHECK: WRITE of size 4 at {{.*}} with type int (in <anonymous type> at offset 0) accesses an existing object of type int (in <anonymous type> at offset 0)
+ // CHECK: {{#0 0x.* in f .*anon-struct.c:}}[[@LINE-3]]
+ printf("%i\n", s1p->i1);
+}
+
+int main() {
+ s1 s = {.i1 = 1, .i1b = 5};
+ f(&s, (s2 *)&s);
+}
+
+// CHECK-NOT: ERROR: TypeSanitizer: type-aliasing-violation
diff --git a/compiler-rt/test/tysan/basic.c b/compiler-rt/test/tysan/basic.c
new file mode 100644
index 00000000000000..8e66e1a7213838
--- /dev/null
+++ b/compiler-rt/test/tysan/basic.c
@@ -0,0 +1,65 @@
+// RUN: %clang_tysan -O0 %s -o %t && %run %t 10 >%t.out.0 2>&1
+// RUN: FileCheck %s < %t.out.0
+// RUN: %clang_tysan -O2 %s -o %t && %run %t 10 >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+void __attribute__((noinline)) add_flt(float *a) {
+ *a += 2.0f;
+ // CHECK: ERROR: TypeSanitizer: type-aliasing-violation
+ // CHECK: READ of size 4 at {{.*}} with type float accesses an existing object of type int
+ // CHECK: {{#0 0x.* in add_flt .*basic.c:}}[[@LINE-3]]
+ // CHECK: ERROR: TypeSanitizer: type-aliasing-violation
+ // CHECK: WRITE of size 4 at {{.*}} with type float accesses an existing object of type int
+ // CHECK: {{#0 0x.* in add_flt .*basic.c:}}[[@LINE-6]]
+ // CHECK: ERROR: TypeSanitizer: type-aliasing-violation
+ // CHECK: READ of size 4 at {{.*}} with type float accesses an existing object of type long
+ // CHECK: {{#0 0x.* in add_flt .*basic.c:}}[[@LINE-9]]
+ // CHECK: ERROR: TypeSanitizer: type-aliasing-violation
+ // CHECK: WRITE of size 4 at {{.*}} with type float accesses an existing object of type long
+ // CHECK: {{#0 0x.* in add_flt .*basic.c:}}[[@LINE-12]]
+ // CHECK: ERROR: TypeSanitizer: type-aliasing-violation
+ // CHECK: READ of size 4 at {{.*}} with type float accesses part of an existing object of type long that starts at offset -4
+ // CHECK: {{#0 0x.* in add_flt .*basic.c:}}[[@LINE-15]]
+ // CHECK: ERROR: TypeSanitizer: type-aliasing-violation
+ // CHECK: WRITE of size 4 at {{.*}} with type float accesses part of an existing object of type long that starts at offset -4
+ // CHECK: {{#0 0x.* in add_flt .*basic.c:}}[[@LINE-18]]
+ // CHECK: ERROR: TypeSanitizer: type-aliasing-violation
+ // CHECK: READ of size 4 at {{.*}} with type float partially accesses an object of type short that starts at offset 2
+ // CHECK: {{#0 0x.* in add_flt .*basic.c:}}[[@LINE-21]]
+}
+
+int main(int argc, char *argv[]) {
+ int x = atoi(argv[1]);
+ add_flt((float *)&x);
+ printf("x = %d\n", x);
+
+ long y = x;
+ add_flt((float *)&y);
+ printf("y = %ld\n", y);
+
+ add_flt(((float *)&y) + 1);
+ printf("y = %ld\n", y);
+
+ char *mem = (char *)malloc(4 * sizeof(short));
+ memset(mem, 0, 4 * sizeof(short));
+ *(short *)(mem + 2) = x;
+ add_flt((float *)mem);
+ short s1 = *(short *)mem;
+ // CHECK: ERROR: TypeSanitizer: type-aliasing-violation
+ // CHECK: READ of size 2 at {{.*}} with type short accesses an existing object of type float
+ // CHECK: {{#0 0x.* in main .*basic.c:}}[[@LINE-3]]
+ short s2 = *(short *)(mem + 2);
+ // CHECK: ERROR: TypeSanitizer: type-aliasing-violation
+ // CHECK: READ of size 2 at {{.*}} with type short accesses part of an existing object of type float that starts at offset -2
+ // CHECK: {{#0 0x.* in main .*basic.c:}}[[@LINE-3]]
+ printf("m[0] = %d, m[1] = %d\n", s1, s2);
+ free(mem);
+
+ return 0;
+}
+
+// CHECK-NOT: ERROR: TypeSanitizer: type-aliasing-violation
diff --git a/compiler-rt/test/tysan/char-memcpy.c b/compiler-rt/test/tysan/char-memcpy.c
new file mode 100644
index 00000000000000..ebbb6b53d0f374
--- /dev/null
+++ b/compiler-rt/test/tysan/char-memcpy.c
@@ -0,0 +1,45 @@
+// RUN: %clang_tysan -O0 %s -o %t && %run %t >%t.out.0 2>&1
+// RUN: FileCheck %s < %t.out.0
+// RUN: %clang_tysan -O2 %s -o %t && %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+#include <stdio.h>
+
+// There's no type-based-aliasing violation here: the memcpy is implemented
+// using only char* or unsigned char* (both of which may alias anything).
+// CHECK-NOT: ERROR: TypeSanitizer: type-aliasing-violation
+
+void my_memcpy_uchar(void *dest, void *src, int n) {
+ unsigned char *p = dest, *q = src, *end = p + n;
+ while (p < end)
+ *p++ = *q++;
+}
+
+void my_memcpy_char(void *dest, void *src, int n) {
+ char *p = dest, *q = src, *end = p + n;
+ while (p < end)
+ *p++ = *q++;
+}
+
+void test_uchar() {
+ struct S {
+ short x;
+ short *r;
+ } s = {10, &s.x}, s2;
+ my_memcpy_uchar(&s2, &s, sizeof(struct S));
+ printf("%d\n", *(s2.r));
+}
+
+void test_char() {
+ struct S {
+ short x;
+ short *r;
+ } s = {10, &s.x}, s2;
+ my_memcpy_char(&s2, &s, sizeof(struct S));
+ printf("%d\n", *(s2.r));
+}
+
+int main() {
+ test_uchar();
+ test_char();
+}
diff --git a/compiler-rt/test/tysan/constexpr-subobject.cpp b/compiler-rt/test/tysan/constexpr-subobject.cpp
new file mode 100644
index 00000000000000..9cae310554c9b4
--- /dev/null
+++ b/compiler-rt/test/tysan/constexpr-subobject.cpp
@@ -0,0 +1,25 @@
+// RUN: %clang_tysan -O0 %s -o %t && %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+// CHECK-NOT: TypeSanitizer
+
+int foo() { return 0; }
+
+struct Bar {
+ struct S2 {
+ int (*fnA)();
+ int (*fnB)();
+ };
+
+ static int x() { return 0; }
+
+ static const S2 &get() {
+ static constexpr S2 Info = {&foo, &Bar::x};
+ return Info;
+ }
+};
+
+int main() {
+ auto Info = Bar::get();
+ return Info.fnB();
+}
diff --git a/compiler-rt/test/tysan/global.c b/compiler-rt/test/tysan/global.c
new file mode 100644
index 00000000000000..247ee768a81626
--- /dev/null
+++ b/compiler-rt/test/tysan/global.c
@@ -0,0 +1,31 @@
+// RUN: %clang_tysan -O0 %s -o %t && %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+#include <stdlib.h>
+#include <string.h>
+
+float P;
+long L;
+
+int main() {
+ *(int *)&P = 5;
+ // CHECK: ERROR: TypeSanitizer: type-aliasing-violation
+ // CHECK: WRITE of size 4 at {{.*}} with type int accesses an existing object of type float
+ // CHECK: {{#0 0x.* in main .*global.c:}}[[@LINE-3]]
+
+ void *mem = malloc(sizeof(long));
+ *(int *)mem = 6;
+ memcpy(mem, &L, sizeof(L));
+ *(int *)mem = 8;
+ // CHECK: ERROR: TypeSanitizer: type-aliasing-violation
+ // CHECK: WRITE of size 4 at {{.*}} with type int accesses an existing object of type long
+ // CHECK: {{#0 0x.* in main .*global.c:}}[[@LINE-3]]
+ int r = *(((int *)mem) + 1);
+ // CHECK: ERROR: TypeSanitizer: type-aliasing-violation
+ // CHECK: READ of size 4 at {{.*}} with type int accesses part of an existing object of type long that starts at offset -4
+ // CHECK: {{#0 0x.* in main .*global.c:}}[[@LINE-3]]
+ free(mem);
+
+ return r;
+}
+
+// CHECK-NOT: ERROR: TypeSanitizer: type-aliasing-violation
diff --git a/compiler-rt/test/tysan/int-long.c b/compiler-rt/test/tysan/int-long.c
new file mode 100644
index 00000000000000..b7956c07376e8e
--- /dev/null
+++ b/compiler-rt/test/tysan/int-long.c
@@ -0,0 +1,21 @@
+// RUN: %clang_tysan -O0 %s -o %t && %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+#include <stdio.h>
+
+long foo(int *x, long *y) {
+ *x = 0;
+ *y = 1;
+ // CHECK: ERROR: TypeSanitizer: type-aliasing-violation
+ // CHECK: WRITE of size 8 at {{.*}} with type long accesses an existing object of type int
+ // CHECK: {{#0 0x.* in foo .*int-long.c:}}[[@LINE-3]]
+
+ return *x;
+}
+
+int main(void) {
+ long l;
+ printf("%ld\n", foo((int *)&l, &l));
+}
+
+// CHECK-NOT: ERROR: TypeSanitizer: type-aliasing-violation
diff --git a/compiler-rt/test/tysan/lit.cfg.py b/compiler-rt/test/tysan/lit.cfg.py
new file mode 100644
index 00000000000000..05c8f0664d5e65
--- /dev/null
+++ b/compiler-rt/test/tysan/lit.cfg.py
@@ -0,0 +1,139 @@
+# -*- Python -*-
+
+import os
+import platform
+import re
+
+import lit.formats
+
+# Get shlex.quote if available (added in 3.3), and fall back to pipes.quote if
+# it's not available.
+try:
+ import shlex
+ sh_quote = shlex.quote
+except:
+ import pipes
+ sh_quote = pipes.quote
+
+def get_required_attr(config, attr_name):
+ attr_value = getattr(config, attr_name, None)
+ if attr_value == None:
+ lit_config.fatal(
+ "No attribute %r in test configuration! You may need to run "
+ "tests from your build directory or add this attribute "
+ "to lit.site.cfg.py " % attr_name)
+ return attr_value
+
+def push_dynamic_library_lookup_path(config, new_path):
+ if platform.system() == 'Windows':
+ dynamic_library_lookup_var = 'PATH'
+ elif platform.system() == 'Darwin':
+ dynamic_library_lookup_var = 'DYLD_LIBRARY_PATH'
+ else:
+ dynamic_library_lookup_var = 'LD_LIBRARY_PATH'
+
+ new_ld_library_path = os.path.pathsep.join(
+ (new_path, config.environment.get(dynamic_library_lookup_var, '')))
+ config.environment[dynamic_library_lookup_var] = new_ld_library_path
+
+ if platform.system() == 'FreeBSD':
+ dynamic_library_lookup_var = 'LD_32_LIBRARY_PATH'
+ new_ld_32_library_path = os.path.pathsep.join(
+ (new_path, config.environment.get(dynamic_library_lookup_var, '')))
+ config.environment[dynamic_library_lookup_var] = new_ld_32_library_path
+
+ if platform.system() == 'SunOS':
+ dynamic_library_lookup_var = 'LD_LIBRARY_PATH_32'
+ new_ld_library_path_32 = os.path.pathsep.join(
+ (new_path, config.environment.get(dynamic_library_lookup_var, '')))
+ config.environment[dynamic_library_lookup_var] = new_ld_library_path_32
+
+ dynamic_library_lookup_var = 'LD_LIBRARY_PATH_64'
+ new_ld_library_path_64 = os.path.pathsep.join(
+ (new_path, config.environment.get(dynamic_library_lookup_var, '')))
+ config.environment[dynamic_library_lookup_var] = new_ld_library_path_64
+
+# Setup config name.
+config.name = 'TypeSanitizer' + config.name_suffix
+
+# Platform-specific default TYSAN_OPTIONS for lit tests.
+default_tysan_opts = list(config.default_sanitizer_opts)
+
+# On Darwin, leak checking is not enabled by default. Enable it for the macOS
+# tests to prevent regressions.
+if config.host_os == 'Darwin' and config.apple_platform == 'osx':
+ default_tysan_opts += ['detect_leaks=1']
+
+default_tysan_opts_str = ':'.join(default_tysan_opts)
+if default_tysan_opts_str:
+ config.environment['TYSAN_OPTIONS'] = default_tysan_opts_str
+ default_tysan_opts_str += ':'
+config.substitutions.append(('%env_tysan_opts=',
+ 'env TYSAN_OPTIONS=' + default_tysan_opts_str))
+
+# Setup source root.
+config.test_source_root = os.path.dirname(__file__)
+
+if config.host_os not in ['FreeBSD', 'NetBSD']:
+ libdl_flag = "-ldl"
+else:
+ libdl_flag = ""
+
+# GCC does not link in all the necessary libraries automatically, so
+# we have to do it ourselves.
+if config.compiler_id == 'GNU':
+ extra_link_flags = ["-pthread", "-lstdc++", libdl_flag]
+else:
+ extra_link_flags = []
+
+# Set up the default compiler flags used with the -fsanitize=type option.
+# FIXME: Review the set of required flags and check if it can be reduced.
+target_cflags = [get_required_attr(config, "target_cflags")] + extra_link_flags
+target_cxxflags = config.cxx_mode_flags + target_cflags
+clang_tysan_static_cflags = (["-fsanitize=type",
+ "-mno-omit-leaf-frame-pointer",
+ "-fno-omit-frame-pointer",
+ "-fno-optimize-sibling-calls"] +
+ config.debug_info_flags + target_cflags)
+if config.target_arch == 's390x':
+ clang_tysan_static_cflags.append("-mbackchain")
+clang_tysan_static_cxxflags = config.cxx_mode_flags + clang_tysan_static_cflags
+
+clang_tysan_cflags = clang_tysan_static_cflags
+clang_tysan_cxxflags = clang_tysan_static_cxxflags
+
+def build_invocation(compile_flags):
+ return " " + " ".join([config.clang] + compile_flags) + " "
+
+config.substitutions.append( ("%clang ", build_invocation(target_cflags)) )
+config.substitutions.append( ("%clangxx ", build_invocation(target_cxxflags)) )
+config.substitutions.append( ("%clang_tysan ", build_invocation(clang_tysan_cflags)) )
+config.substitutions.append( ("%clangxx_tysan ", build_invocation(clang_tysan_cxxflags)) )
+
+
+# FIXME: De-hardcode this path.
+tysan_source_dir = os.path.join(
+ get_required_attr(config, "compiler_rt_src_root"), "lib", "tysan")
+python_exec = sh_quote(get_required_attr(config, "python_executable"))
+
+# Set LD_LIBRARY_PATH to pick dynamic runtime up properly.
+push_dynamic_library_lookup_path(config, config.compiler_rt_libdir)
+
+# Default test suffixes.
+config.suffixes = ['.c', '.cpp']
+
+if config.host_os == 'Darwin':
+ config.suffixes.append('.mm')
+
+if config.host_os == 'Windows':
+ config.substitutions.append(('%fPIC', ''))
+ config.substitutions.append(('%fPIE', ''))
+ config.substitutions.append(('%pie', ''))
+else:
+ config.substitutions.append(('%fPIC', '-fPIC'))
+ config.substitutions.append(('%fPIE', '-fPIE'))
+ config.substitutions.append(('%pie', '-pie'))
+
+# Only run the tests on supported OSs.
+if config.host_os not in ['Linux', 'Darwin',]:
+ config.unsupported = True
diff --git a/compiler-rt/test/tysan/lit.site.cfg.py.in b/compiler-rt/test/tysan/lit.site.cfg.py.in
new file mode 100644
index 00000000000000..b56dce4fed7a26
--- /dev/null
+++ b/compiler-rt/test/tysan/lit.site.cfg.py.in
@@ -0,0 +1,17 @@
+ at LIT_SITE_CFG_IN_HEADER@
+
+# Tool-specific config options.
+config.name_suffix = "@TYSAN_TEST_CONFIG_SUFFIX@"
+config.target_cflags = "@TYSAN_TEST_TARGET_CFLAGS@"
+config.clang = "@TYSAN_TEST_TARGET_CC@"
+config.bits = "@TYSAN_TEST_BITS@"
+config.arm_thumb = "@COMPILER_RT_ARM_THUMB@"
+config.apple_platform = "@TYSAN_TEST_APPLE_PLATFORM@"
+config.apple_platform_min_deployment_target_flag = "@TYSAN_TEST_MIN_DEPLOYMENT_TARGET_FLAG@"
+config.target_arch = "@TYSAN_TEST_TARGET_ARCH@"
+
+# Load common config for all compiler-rt lit tests.
+lit_config.load_config(config, "@COMPILER_RT_BINARY_DIR@/test/lit.common.configured")
+
+# Load tool-specific config that would do the real work.
+lit_config.load_config(config, "@TYSAN_LIT_SOURCE_DIR@/lit.cfg.py")
diff --git a/compiler-rt/test/tysan/ptr-float.c b/compiler-rt/test/tysan/ptr-float.c
new file mode 100644
index 00000000000000..61fa5f1afd70ac
--- /dev/null
+++ b/compiler-rt/test/tysan/ptr-float.c
@@ -0,0 +1,19 @@
+// RUN: %clang_tysan -O0 %s -o %t && %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+float *P;
+void zero_array() {
+ int i;
+ for (i = 0; i < 1; ++i)
+ P[i] = 0.0f;
+ // CHECK: ERROR: TypeSanitizer: type-aliasing-violation
+ // CHECK: WRITE of size 4 at {{.*}} with type float accesses an existing object of type any pointer
+ // CHECK: {{#0 0x.* in zero_array .*ptr-float.c:}}[[@LINE-3]]
+}
+
+int main() {
+ P = (float *)&P;
+ zero_array();
+}
+
+// CHECK-NOT: ERROR: TypeSanitizer: type-aliasing-violation
diff --git a/compiler-rt/test/tysan/struct-offset-multiple-compilation-units.cpp b/compiler-rt/test/tysan/struct-offset-multiple-compilation-units.cpp
new file mode 100644
index 00000000000000..f7baa14d15affa
--- /dev/null
+++ b/compiler-rt/test/tysan/struct-offset-multiple-compilation-units.cpp
@@ -0,0 +1,51 @@
+// RUN: %clangxx_tysan -O0 %s -c -o %t.o
+// RUN: %clangxx_tysan -O0 %s -DPMAIN -c -o %tm.o
+// RUN: %clangxx_tysan -O0 %s -DPINIT -c -o %tinit.o
+// RUN: %clangxx_tysan -O0 %t.o %tm.o %tinit.o -o %t
+// RUN: %run %t 2>&1 | FileCheck %s
+
+#include <stdio.h>
+#include <stdlib.h>
+
+extern "C" {
+typedef struct X {
+ int *start;
+ int *end;
+ int i;
+} X;
+};
+
+#ifdef PMAIN
+int foo(struct X *);
+void bar(struct X *);
+void init(struct X *);
+
+int main() {
+ struct X x;
+ init(&x);
+ printf("%d\n", foo(&x));
+ free(x.start);
+ return 0;
+}
+
+#elif PINIT
+
+void init(struct X *x) {
+ x->start = (int *)calloc(100, sizeof(int));
+ x->end = x->start + 99;
+ x->i = 0;
+}
+
+#else
+
+__attribute__((noinline)) int foo(struct X *x) {
+ if (x->start < x->end)
+ return 30;
+ return 10;
+}
+
+void bar(struct X *x) { x->end = NULL; }
+
+#endif
+
+// CHECK-NOT: ERROR: TypeSanitizer: type-aliasing-violation
diff --git a/compiler-rt/test/tysan/struct-offset.c b/compiler-rt/test/tysan/struct-offset.c
new file mode 100644
index 00000000000000..7295e0ae121ed7
--- /dev/null
+++ b/compiler-rt/test/tysan/struct-offset.c
@@ -0,0 +1,26 @@
+// RUN: %clang_tysan -O0 %s -o %t && %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+#include <stdio.h>
+#include <stdlib.h>
+
+struct X {
+ int i;
+ int j;
+};
+
+int foo(struct X *p, struct X *q) {
+ q->j = 1;
+ p->i = 0;
+ // CHECK: ERROR: TypeSanitizer: type-aliasing-violation
+ // CHECK: WRITE of size 4 at {{.*}} with type int (in X at offset 0) accesses an existing object of type int (in X at offset 4)
+ // CHECK: {{#0 0x.* in foo .*struct-offset.c:}}[[@LINE-3]]
+ return q->j;
+}
+
+int main() {
+ unsigned char *p = malloc(3 * sizeof(int));
+ printf("%i\n", foo((struct X *)(p + sizeof(int)), (struct X *)p));
+}
+
+// CHECK-NOT: ERROR: TypeSanitizer: type-aliasing-violation
diff --git a/compiler-rt/test/tysan/struct.c b/compiler-rt/test/tysan/struct.c
new file mode 100644
index 00000000000000..f7ecef59676244
--- /dev/null
+++ b/compiler-rt/test/tysan/struct.c
@@ -0,0 +1,39 @@
+// RUN: %clang_tysan -O0 %s -o %t && %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+#include <stdio.h>
+
+typedef struct S1 {
+ int i1;
+} s1;
+typedef struct S2 {
+ int i2;
+} s2;
+
+void g(int *i) {
+ *i = 5;
+ printf("%i\n", *i);
+}
+
+void h(char *c) {
+ *c = 5;
+ printf("%i\n", (int)*c);
+}
+
+void f(s1 *s1p, s2 *s2p) {
+ s1p->i1 = 2;
+ s2p->i2 = 3;
+ // CHECK: ERROR: TypeSanitizer: type-aliasing-violation
+ // CHECK: WRITE of size 4 at {{.*}} with type int (in S2 at offset 0) accesses an existing object of type int (in S1 at offset 0)
+ // CHECK: {{#0 0x.* in f .*struct.c:}}[[@LINE-3]]
+ printf("%i\n", s1p->i1);
+}
+
+int main() {
+ s1 s = {.i1 = 1};
+ f(&s, (s2 *)&s);
+ g(&s.i1);
+ h((char *)&s.i1);
+}
+
+// CHECK-NOT: ERROR: TypeSanitizer: type-aliasing-violation
diff --git a/compiler-rt/test/tysan/union-wr-wr.c b/compiler-rt/test/tysan/union-wr-wr.c
new file mode 100644
index 00000000000000..6414bbfcf9d95b
--- /dev/null
+++ b/compiler-rt/test/tysan/union-wr-wr.c
@@ -0,0 +1,18 @@
+// RUN: %clang_tysan -O0 %s -o %t && %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+#include <stdio.h>
+
+// CHECK-NOT: ERROR: TypeSanitizer: type-aliasing-violation
+
+int main() {
+ union {
+ int i;
+ short s;
+ } u;
+
+ u.i = 42;
+ u.s = 1;
+
+ printf("%d\n", u.i);
+}
diff --git a/compiler-rt/test/tysan/violation-pr45282.c b/compiler-rt/test/tysan/violation-pr45282.c
new file mode 100644
index 00000000000000..f3583d6be6f6a3
--- /dev/null
+++ b/compiler-rt/test/tysan/violation-pr45282.c
@@ -0,0 +1,32 @@
+// RUN: %clang_tysan -O0 %s -o %t && %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+// https://github.com/llvm/llvm-project/issues/45282
+
+#include <stdio.h>
+
+int main(void) {
+
+ double a[29], b[20];
+ int i, j;
+
+ for (i = 0; i < 20; ++i) {
+ b[i] = 2.01f + 1.f;
+ ((float *)a)[i] = 2.01f * 2.0145f;
+ ((float *)a + 38)[i] = 2.01f * 1.0123f;
+ }
+
+ // CHECK: TypeSanitizer: type-aliasing-violation on address
+ // CHECK-NEXT: WRITE of size 8 at {{.+}} with type double accesses an existing object of type float
+ // CHECK-NEXT: in main violation-pr45282.c:25
+
+ // This loop mixes double and float accesses to overlapping elements of a.
+ for (j = 2; j <= 4; ++j) {
+ a[j - 1] = ((float *)a)[j] * ((float *)a + 38)[j - 1];
+ ((float *)a + 38)[j - 1] = ((float *)a)[j - 1] + b[j - 1];
+ }
+
+ printf("((float *)a + 38)[2] = %f\n", ((float *)a + 38)[2]);
+
+ return 0;
+}
diff --git a/compiler-rt/test/tysan/violation-pr47137.c b/compiler-rt/test/tysan/violation-pr47137.c
new file mode 100644
index 00000000000000..3987128ff6fc67
--- /dev/null
+++ b/compiler-rt/test/tysan/violation-pr47137.c
@@ -0,0 +1,41 @@
+// RUN: %clang_tysan -O0 %s -o %t && %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+// https://github.com/llvm/llvm-project/issues/47137
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+void f(int m) {
+ int n = (4 * m + 2) / 3;
+ uint64_t *a = malloc(n * sizeof(uint64_t));
+ uint64_t *b = malloc(n * sizeof(uint64_t));
+ uint64_t aa[] = {0xffff3e0000000001, 0x22eaf0b680a88c16, 0x5a65d25ac40e20f3,
+ 0x34e7ac346236953e, 0x9dea3e0a26c6ba89, 0x0000000000000000,
+ 0x0000000000000000, 0x0000000000000000};
+ uint64_t bb[] = {0x0000000024c0ffff, 0x000000004634d940, 0x00000000219d18ef,
+ 0x0000000000154519, 0x000000000000035f, 0x0000000000000000,
+ 0x0000000000000000, 0x0000000000000000};
+ char l[20];
+ l[0] = 0;
+ for (int i = 0; i < n; i++) {
+ a[i] = aa[i] + l[0] - '0';
+ b[i] = bb[i] + l[0] - '0';
+ }
+
+ // CHECK: TypeSanitizer: type-aliasing-violation on address
+ // CHECK-NEXT: READ of size 2 at {{.+}} with type short accesses an existing object of type long long
+ // CHECK-NEXT: in f violation-pr47137.c:31
+ for (int i = 0, j = 0; j < 4 * m; i += 4, j += 3) {
+ for (int k = 0; k < 3; k++) {
+ ((uint16_t *)a)[j + k] = ((uint16_t *)a)[i + k];
+ ((uint16_t *)b)[j + k] = ((uint16_t *)b)[i + k];
+ }
+ }
+
+ printf("a: %016llx\n", a[0]);
+ free(a);
+ free(b);
+}
+
+int main() { f(6); }
diff --git a/compiler-rt/test/tysan/violation-pr51837.c b/compiler-rt/test/tysan/violation-pr51837.c
new file mode 100644
index 00000000000000..d49a813933d653
--- /dev/null
+++ b/compiler-rt/test/tysan/violation-pr51837.c
@@ -0,0 +1,36 @@
+// RUN: %clang_tysan -O0 %s -o %t && %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+#include <stdint.h>
+#include <stdio.h>
+
+// CHECK-NOT: TypeSanitizer
+
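+// All accesses below go through members of the same union object 'd'; no
+// aliasing violation is expected.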
+union a {
+ int16_t b;
+ uint64_t c;
+} d;
+
+uint64_t *e = &d.c;
+static uint16_t f(int16_t a, int32_t b, uint64_t c);
+static int64_t g(int32_t aa, uint8_t h, union a bb) {
+ int16_t *i = &d.b;
+ f(0, h, 0);
+ *i = h;
+ return 0;
+}
+uint16_t f(int16_t a, int32_t b, uint64_t c) {
+ for (d.c = 0; 0;)
+ ;
+ *e = 0;
+ return 0;
+}
+
+int main() {
+ uint32_t j = 8;
+ g(1, j, d);
+ printf("%d\n", d.b);
+ return 0;
+}
diff --git a/compiler-rt/test/tysan/violation-pr62544.c b/compiler-rt/test/tysan/violation-pr62544.c
new file mode 100644
index 00000000000000..30610925ba385f
--- /dev/null
+++ b/compiler-rt/test/tysan/violation-pr62544.c
@@ -0,0 +1,24 @@
+// RUN: %clang_tysan -O0 %s -o %t && %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+// https://github.com/llvm/llvm-project/issues/62544
+
+int printf(const char *, ...);
+int a, b, c;
+long d;
+int main() {
+ short *e = (short *)&a;
+ int *f = &a;
+ *f = 0;
+ for (; b <= 9; b++) {
+ int **g = &f;
+ *f = d;
+ *g = &c;
+ }
+
+ // CHECK: TypeSanitizer: type-aliasing-violation on address
+ // CHECK-NEXT: WRITE of size 2 at {{.+}} with type short accesses an existing object of type int
+ // CHECK-NEXT: in main violation-pr62544.c:22
+ *e = 3;
+ printf("%d\n", a);
+}
diff --git a/compiler-rt/test/tysan/violation-pr62828.cpp b/compiler-rt/test/tysan/violation-pr62828.cpp
new file mode 100644
index 00000000000000..33003df9761f52
--- /dev/null
+++ b/compiler-rt/test/tysan/violation-pr62828.cpp
@@ -0,0 +1,44 @@
+// RUN: %clang_tysan -O0 %s -o %t && %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+// https://github.com/llvm/llvm-project/issues/62828
+#include <stdio.h>
+
+typedef int int_v8[8];
+typedef short short_v8[8];
+short *test1(int_v8 *cast_c_array, short_v8 *shuf_c_array1, int *ptr) {
+ int *input1 = reinterpret_cast<int *>(((int_v8 *)(cast_c_array)));
+ short *input2 = reinterpret_cast<short *>(reinterpret_cast<int_v8 *>(input1));
+
+ short *output1 = reinterpret_cast<short *>(((short_v8 *)(shuf_c_array1)));
+ short *output2 =
+ reinterpret_cast<short *>(reinterpret_cast<short_v8 *>(output1));
+
+ for (int r = 0; r < 8; ++r) {
+ int tmp = (int)((r * 4) + ptr[r]);
+ if ((ptr[r] / 4) == 0) {
+ int *input = reinterpret_cast<int *>(((int_v8 *)(cast_c_array)));
+ input[r] = tmp;
+ }
+ }
+
+ // CHECK: ERROR: TypeSanitizer: type-aliasing-violation on address
+ // CHECK-NEXT: READ of size 2 at {{.+}} with type short accesses an existing object of type int
+ // CHECK-NEXT: in test1(int (*) [8], short (*) [8], int*) violation-pr62828.cpp:29
+ for (int i3 = 0; i3 < 4; ++i3) {
+ output2[i3] = input2[(i3 * 2)];
+ }
+ return output2;
+}
+
+int main() {
+ int_v8 in[4] = {{4, 4, 4, 4}};
+ short_v8 out[4] = {{0}};
+ int ptr[8] = {2};
+ test1(in, out, ptr);
+ short *p = reinterpret_cast<short *>(out);
+ for (int i = 0; i < 32; i++) {
+ printf("%d ", p[i]);
+ }
+ return 0;
+}
diff --git a/compiler-rt/test/tysan/violation-pr68655.cpp b/compiler-rt/test/tysan/violation-pr68655.cpp
new file mode 100644
index 00000000000000..ac20f8c94e1ffd
--- /dev/null
+++ b/compiler-rt/test/tysan/violation-pr68655.cpp
@@ -0,0 +1,40 @@
+// RUN: %clang_tysan -O0 %s -o %t && %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+// https://github.com/llvm/llvm-project/issues/68655
+struct S1 {
+ long long a;
+ long long b;
+};
+
+// CHECK: TypeSanitizer: type-aliasing-violation on address
+// CHECK-NEXT: READ of size 4 at {{.+}} with type int accesses an existing object of type long long (in S1 at offset 0)
+// CHECK-NEXT: in copyMem(S1*, S1*) violation-pr68655.cpp:19
+
+inline void copyMem(S1 *dst, S1 *src) {
+ unsigned *d = reinterpret_cast<unsigned *>(dst);
+ unsigned *s = reinterpret_cast<unsigned *>(src);
+
+ for (int i = 0; i < sizeof(S1) / sizeof(unsigned); i++) {
+ *d = *s;
+ d++;
+ s++;
+ }
+}
+
+void math(S1 *dst, int *srcA, int idx_t) {
+ S1 zero[4];
+ for (int i = 0; i < 2; i++) {
+ zero[i].a = i + idx_t;
+ zero[i].b = i * idx_t;
+ }
+
+ copyMem(&dst[idx_t], &zero[srcA[idx_t]]);
+}
+
+int main() {
+ S1 dst = {0};
+ int Src[2] = {0, 0};
+ math(&dst, &Src[0], 0);
+ return 0;
+}
diff --git a/compiler-rt/test/tysan/violation-pr86685.c b/compiler-rt/test/tysan/violation-pr86685.c
new file mode 100644
index 00000000000000..fe4fd82af5fdd2
--- /dev/null
+++ b/compiler-rt/test/tysan/violation-pr86685.c
@@ -0,0 +1,29 @@
+// RUN: %clang_tysan -O0 %s -o %t && %run %t >%t.out 2>&1
+// RUN: FileCheck %s < %t.out
+
+#include <stdio.h>
+#include <stdlib.h>
+
+// Violation reported in https://github.com/llvm/llvm-project/issues/86685.
+void foo(int *s, float *f, long n) {
+ for (long i = 0; i < n; ++i) {
+ *f = 2;
+ if (i == 1)
+ break;
+
+ // CHECK: TypeSanitizer: type-aliasing-violation on address
+ // CHECK-NEXT: WRITE of size 4 at {{.+}} with type int accesses an existing object of type float
+ // CHECK-NEXT: #0 {{.+}} in foo violation-pr86685.c:17
+ *s = 4;
+ }
+}
+
+int main(void) {
+ union {
+ int s;
+ float f;
+ } u = {0};
+ foo(&u.s, &u.f, 2);
+ printf("%.f\n", u.f);
+ return 0;
+}
>From 625005b6d41eff9d9dc0e564d6233a5af7fcc462 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Fri, 6 Dec 2024 12:08:52 +0000
Subject: [PATCH 18/18] !fixup update tests
---
compiler-rt/test/tysan/constexpr-subobject.cpp | 2 +-
compiler-rt/test/tysan/ptr-float.c | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/compiler-rt/test/tysan/constexpr-subobject.cpp b/compiler-rt/test/tysan/constexpr-subobject.cpp
index 9cae310554c9b4..c473ffe5e445bd 100644
--- a/compiler-rt/test/tysan/constexpr-subobject.cpp
+++ b/compiler-rt/test/tysan/constexpr-subobject.cpp
@@ -1,5 +1,5 @@
// RUN: %clang_tysan -O0 %s -o %t && %run %t >%t.out 2>&1
-// RUN: FileCheck %s < %t.out
+// RUN: FileCheck --allow-empty %s < %t.out
// CHECK-NOT: TypeSanitizer
diff --git a/compiler-rt/test/tysan/ptr-float.c b/compiler-rt/test/tysan/ptr-float.c
index 61fa5f1afd70ac..aaa98959869886 100644
--- a/compiler-rt/test/tysan/ptr-float.c
+++ b/compiler-rt/test/tysan/ptr-float.c
@@ -7,7 +7,7 @@ void zero_array() {
for (i = 0; i < 1; ++i)
P[i] = 0.0f;
// CHECK: ERROR: TypeSanitizer: type-aliasing-violation
- // CHECK: WRITE of size 4 at {{.*}} with type float accesses an existing object of type any pointer
+ // CHECK: WRITE of size 4 at {{.*}} with type float accesses an existing object of type p1 float
// CHECK: {{#0 0x.* in zero_array .*ptr-float.c:}}[[@LINE-3]]
}