[compiler-rt] [hwasan] Fix and re-enable deep-recursion.c (PR #69265)

Mon Oct 16 16:54:42 PDT 2023

https://github.com/thurstond updated https://github.com/llvm/llvm-project/pull/69265

>From 999d8fa570a4ca6aa5f8e2ba0233edfc4ddec357 Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Mon, 16 Oct 2023 23:05:53 +0000
Subject: [PATCH 01/14] [hwasan] Fix and re-enable deep-recursion.c

deep-recursion.c was disabled
(https://github.com/llvm/llvm-project/commit/c007e0f66ee3f96467fd12f6200218fb4c38c2c9)
because the test may get unlucky and end up with a zero-tagged variable, leading to a false negative (https://github.com/llvm/llvm-project/issues/69221).

This patch re-enables the test and adds a workaround: it checks
if the variable is zero-tagged, and if so, it will instead use the
neighboring variable, which must have a different (hence non-zero)
tag.

Fixing the stack allocation tagging is left as an exercise for the
reader. It is non-trivial because, even if the stackTagBase is
non-zero, tags for subsequent allocations in the stack frame may wrap
around to zero; working around this would require adding multiple
instructions to each alloca.
---
 .../test/hwasan/TestCases/deep-recursion.c    | 21 +++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/compiler-rt/test/hwasan/TestCases/deep-recursion.c b/compiler-rt/test/hwasan/TestCases/deep-recursion.c
index bf390d051d472e7..c992f205917ba22 100644
--- a/compiler-rt/test/hwasan/TestCases/deep-recursion.c
+++ b/compiler-rt/test/hwasan/TestCases/deep-recursion.c
@@ -17,9 +17,6 @@
 // Stack histories are currently not recorded on x86.
 // XFAIL: target=x86_64{{.*}}
 
-// Flaky on AArch64 Linux, see https://github.com/llvm/llvm-project/issues/69221.
-// UNSUPPORTED: target=aarch64{{.*}}
-
 #include <stdlib.h>
 // At least -O1 is needed for this function to not have a stack frame on
 // AArch64.
@@ -29,7 +26,23 @@ void USE(void *x) { // pretend_to_do_something(void *x)
 
 volatile int four = 4;
 
-__attribute__((noinline)) void OOB() { int x[4]; x[four] = 0; USE(&x[0]); }
+__attribute__((noinline)) void OOB() {
+  int x[4];
+  int y[4];
+
+  // Tags for stack-allocated variables can occasionally be zero, resulting in
+  // a false negative for this test. This is not easy to fix, hence we work
+  // around it: if the tag is zero, we use the neighboring variable instead,
+  // which must have a different (hence non-zero) tag.
+  // This tag check assumes aarch64.
+  if(((unsigned long)&x) >> 56 == 0) {
+    y[four] = 0;
+  } else {
+    x[four] = 0;
+  }
+  USE(&x[0]);
+  USE(&y[0]);
+}
 __attribute__((noinline)) void FUNC1() { int x; USE(&x); OOB(); }
 __attribute__((noinline)) void FUNC2() { int x; USE(&x); FUNC1(); }
 __attribute__((noinline)) void FUNC3() { int x; USE(&x); FUNC2(); }

>From db33a7b07b89610b598059cf973f21fff9dd1c6f Mon Sep 17 00:00:00 2001
From: Peter Klausler <35819229+klausler at users.noreply.github.com>
Date: Mon, 16 Oct 2023 15:55:33 -0700
Subject: [PATCH 02/14] [flang] Avoid needless overflow when folding NORM2
 (#67499)

The code that folds the relatively new NORM2 intrinsic function can
produce overflow in cases where it's not warranted. Rearrange to NORM2 =
M * SQRT((A(:)/M)**2) where M is MAXVAL(ABS(A)).
---
 flang/lib/Evaluate/fold-real.cpp    | 28 ++++++++++++++++++++++------
 flang/lib/Evaluate/fold-reduction.h |  2 +-
 flang/test/Evaluate/fold-norm2.f90  | 13 ++++++++++---
 3 files changed, 33 insertions(+), 10 deletions(-)

diff --git a/flang/lib/Evaluate/fold-real.cpp b/flang/lib/Evaluate/fold-real.cpp
index 8e3ab1d8fd30b09..6bcc3ec73982157 100644
--- a/flang/lib/Evaluate/fold-real.cpp
+++ b/flang/lib/Evaluate/fold-real.cpp
@@ -52,15 +52,28 @@ template <int KIND> class Norm2Accumulator {
       const Constant<T> &array, const Constant<T> &maxAbs, Rounding rounding)
       : array_{array}, maxAbs_{maxAbs}, rounding_{rounding} {};
   void operator()(Scalar<T> &element, const ConstantSubscripts &at) {
-    // Kahan summation of scaled elements
+    // Kahan summation of scaled elements:
+    // Naively,
+    //   NORM2(A(:)) = SQRT(SUM(A(:)**2))
+    // For any T > 0, we have mathematically
+    //   SQRT(SUM(A(:)**2))
+    //     = SQRT(T**2 * (SUM(A(:)**2) / T**2))
+    //     = SQRT(T**2 * SUM(A(:)**2 / T**2))
+    //     = SQRT(T**2 * SUM((A(:)/T)**2))
+    //     = SQRT(T**2) * SQRT(SUM((A(:)/T)**2))
+    //     = T * SQRT(SUM((A(:)/T)**2))
+    // By letting T = MAXVAL(ABS(A)), we ensure that
+    // ALL(ABS(A(:)/T) <= 1), so ALL((A(:)/T)**2 <= 1), and the SUM will
+    // not overflow unless absolutely necessary.
     auto scale{maxAbs_.At(maxAbsAt_)};
     if (scale.IsZero()) {
-      // If maxAbs is zero, so are all elements, and result
+      // Maximum value is zero, and so will the result be.
+      // Avoid division by zero below.
       element = scale;
     } else {
       auto item{array_.At(at)};
       auto scaled{item.Divide(scale).value};
-      auto square{item.Multiply(scaled).value};
+      auto square{scaled.Multiply(scaled).value};
       auto next{square.Add(correction_, rounding_)};
       overflow_ |= next.flags.test(RealFlag::Overflow);
       auto sum{element.Add(next.value, rounding_)};
@@ -73,13 +86,16 @@ template <int KIND> class Norm2Accumulator {
   }
   bool overflow() const { return overflow_; }
   void Done(Scalar<T> &result) {
+    // result+correction == SUM((data(:)/maxAbs)**2)
+    // result = maxAbs * SQRT(result+correction)
     auto corrected{result.Add(correction_, rounding_)};
     overflow_ |= corrected.flags.test(RealFlag::Overflow);
     correction_ = Scalar<T>{};
-    auto rescaled{corrected.value.Multiply(maxAbs_.At(maxAbsAt_))};
+    auto root{corrected.value.SQRT().value};
+    auto product{root.Multiply(maxAbs_.At(maxAbsAt_))};
     maxAbs_.IncrementSubscripts(maxAbsAt_);
-    overflow_ |= rescaled.flags.test(RealFlag::Overflow);
-    result = rescaled.value.SQRT().value;
+    overflow_ |= product.flags.test(RealFlag::Overflow);
+    result = product.value;
   }
 
 private:
diff --git a/flang/lib/Evaluate/fold-reduction.h b/flang/lib/Evaluate/fold-reduction.h
index cff7f54c60d91ba..0dd55124e6a512e 100644
--- a/flang/lib/Evaluate/fold-reduction.h
+++ b/flang/lib/Evaluate/fold-reduction.h
@@ -228,7 +228,7 @@ template <typename T, bool ABS = false> class MaxvalMinvalAccumulator {
         test.Rewrite(context_, std::move(test)))};
     CHECK(folded.has_value());
     if (folded->IsTrue()) {
-      element = array_.At(at);
+      element = aAt;
     }
   }
   void Done(Scalar<T> &) const {}
diff --git a/flang/test/Evaluate/fold-norm2.f90 b/flang/test/Evaluate/fold-norm2.f90
index 30d5289b5a6e33c..370532bafaa13cf 100644
--- a/flang/test/Evaluate/fold-norm2.f90
+++ b/flang/test/Evaluate/fold-norm2.f90
@@ -17,13 +17,20 @@ module m
   real(dp), parameter :: a(3,4) = &
     reshape([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], shape(a))
   real(dp), parameter :: nAll = norm2(a)
-  real(dp), parameter :: check_nAll = sqrt(sum(a * a))
+  real(dp), parameter :: check_nAll = 11._dp * sqrt(sum((a/11._dp)**2))
   logical, parameter :: test_all = nAll == check_nAll
   real(dp), parameter :: norms1(4) = norm2(a, dim=1)
-  real(dp), parameter :: check_norms1(4) = sqrt(sum(a * a, dim=1))
+  real(dp), parameter :: check_norms1(4) = [ &
+    2.236067977499789805051477742381393909454345703125_8, &
+    7.07106781186547550532850436866283416748046875_8, &
+    1.2206555615733702069292121450416743755340576171875e1_8, &
+    1.7378147196982769884243680280633270740509033203125e1_8 ]
   logical, parameter :: test_norms1 = all(norms1 == check_norms1)
   real(dp), parameter :: norms2(3) = norm2(a, dim=2)
-  real(dp), parameter :: check_norms2(3) = sqrt(sum(a * a, dim=2))
+  real(dp), parameter :: check_norms2(3) = [ &
+    1.1224972160321822656214862945489585399627685546875e1_8, &
+    1.28840987267251261272349438513629138469696044921875e1_8, &
+    1.4628738838327791427218471653759479522705078125e1_8 ]
   logical, parameter :: test_norms2 = all(norms2 == check_norms2)
   logical, parameter :: test_normZ = norm2([0.,0.,0.]) == 0.
 end

>From a0f824af4744f2e3cf643e6830c8958527813873 Mon Sep 17 00:00:00 2001
From: Kirill Stoimenov <87100199+kstoimenov at users.noreply.github.com>
Date: Mon, 16 Oct 2023 16:09:44 -0700
Subject: [PATCH 03/14] [HWASAN] Add bcmp interceptor (#69257)

---
 .../lib/hwasan/hwasan_platform_interceptors.h |  4 +--
 compiler-rt/test/hwasan/TestCases/bcmp.cpp    | 27 +++++++++++++++++++
 2 files changed, 29 insertions(+), 2 deletions(-)
 create mode 100644 compiler-rt/test/hwasan/TestCases/bcmp.cpp

diff --git a/compiler-rt/lib/hwasan/hwasan_platform_interceptors.h b/compiler-rt/lib/hwasan/hwasan_platform_interceptors.h
index 390c9d80c38edd9..86d26b5ac12d4a7 100644
--- a/compiler-rt/lib/hwasan/hwasan_platform_interceptors.h
+++ b/compiler-rt/lib/hwasan/hwasan_platform_interceptors.h
@@ -68,8 +68,8 @@
 // #undef SANITIZER_INTERCEPT_MEMCMP
 // #define SANITIZER_INTERCEPT_MEMCMP 0
 
-#undef SANITIZER_INTERCEPT_BCMP
-#define SANITIZER_INTERCEPT_BCMP 0
+// #undef SANITIZER_INTERCEPT_BCMP
+// #define SANITIZER_INTERCEPT_BCMP 0
 
 #undef SANITIZER_INTERCEPT_STRNDUP
 #define SANITIZER_INTERCEPT_STRNDUP 0
diff --git a/compiler-rt/test/hwasan/TestCases/bcmp.cpp b/compiler-rt/test/hwasan/TestCases/bcmp.cpp
new file mode 100644
index 000000000000000..3dee4b8490efc06
--- /dev/null
+++ b/compiler-rt/test/hwasan/TestCases/bcmp.cpp
@@ -0,0 +1,27 @@
+// RUN: %clangxx_hwasan -O0 %s -o %t && not %run %t 2>&1 | FileCheck %s
+// RUN: %clangxx_hwasan -O1 %s -o %t && not %run %t 2>&1 | FileCheck %s
+// RUN: %clangxx_hwasan -O2 %s -o %t && not %run %t 2>&1 | FileCheck %s
+// RUN: %clangxx_hwasan -O3 %s -o %t && not %run %t 2>&1 | FileCheck %s
+
+#include <sanitizer/hwasan_interface.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+int main(int argc, char **argv) {
+  __hwasan_enable_allocator_tagging();
+  char a[] = {static_cast<char>(argc), 2, 3, 4};
+  int size = sizeof(a);
+  char *p = (char *)malloc(size);
+  memcpy(p, a, size);
+  free(p);
+  return bcmp(p, a, size);
+  // CHECK: HWAddressSanitizer: tag-mismatch on address
+  // CHECK: READ of size 4
+  // CHECK: #{{[[:digit:]]+}} 0x{{[[:xdigit:]]+}} in main {{.*}}bcmp.cpp:[[@LINE-3]]
+  // CHECK: Cause: use-after-free
+  // CHECK: freed by thread
+  // CHECK: #{{[[:digit:]]+}} 0x{{[[:xdigit:]]+}} in main {{.*}}bcmp.cpp:[[@LINE-7]]
+  // CHECK: previously allocated by thread
+  // CHECK: #{{[[:digit:]]+}} 0x{{[[:xdigit:]]+}} in main {{.*}}bcmp.cpp:[[@LINE-11]]
+}

>From ce073317d639692b06540dcfdfd4726126f16526 Mon Sep 17 00:00:00 2001
From: Tai Ly <tai.ly at arm.com>
Date: Mon, 16 Oct 2023 18:10:17 -0500
Subject: [PATCH 04/14] [TOSA] Add StatefulOps to TOSA Dialect (#66843)

This patch adds tosa.variable, tosa.variable.read and
tosa.variable.write operators and tests.


Change-Id: I647e2e5c3762d7890b03f6aa7c09a29198b7d355

---------

Signed-off-by: Jerry Ge <jerry.ge at arm.com>
Co-authored-by: Jerry Ge <jerry.ge at arm.com>
---
 .../Conversion/TosaToLinalg/TosaToLinalg.h    |  4 +-
 mlir/include/mlir/Dialect/Tosa/IR/TosaOps.h   |  5 +
 .../mlir/Dialect/Tosa/IR/TosaUtilOps.td       | 67 ++++++++++++++
 .../mlir/Dialect/Tosa/Transforms/Passes.h     |  3 -
 .../mlir/Dialect/Tosa/Transforms/Passes.td    |  3 +-
 .../TosaToLinalg/TosaToLinalgPass.cpp         |  5 +-
 mlir/lib/Dialect/Tosa/IR/TosaOps.cpp          | 43 +++++++++
 .../Tosa/Transforms/TosaValidation.cpp        | 92 +++++++++++++++++--
 mlir/test/Dialect/Tosa/invalid.mlir           | 45 +++++++++
 mlir/test/Dialect/Tosa/variables.mlir         | 33 +++++++
 10 files changed, 281 insertions(+), 19 deletions(-)
 create mode 100644 mlir/test/Dialect/Tosa/variables.mlir

diff --git a/mlir/include/mlir/Conversion/TosaToLinalg/TosaToLinalg.h b/mlir/include/mlir/Conversion/TosaToLinalg/TosaToLinalg.h
index d8d4027500f99c6..c411010603ac61f 100644
--- a/mlir/include/mlir/Conversion/TosaToLinalg/TosaToLinalg.h
+++ b/mlir/include/mlir/Conversion/TosaToLinalg/TosaToLinalg.h
@@ -35,8 +35,8 @@ std::unique_ptr<Pass> createTosaToLinalgNamed();
 void addTosaToLinalgPasses(
     OpPassManager &pm, const TosaToLinalgOptions &options,
     // Note: Default to 'none' level unless otherwise specified.
-    tosa::ValidationOptions const &validationOptions =
-        tosa::ValidationOptions().setLevel(tosa::TosaLevelEnum::None));
+    tosa::TosaValidationOptions const &validationOptions = {
+        tosa::TosaProfileEnum::Undefined, false, tosa::TosaLevelEnum::None});
 
 /// Populates conversion passes from TOSA dialect to Linalg dialect.
 void populateTosaToLinalgConversionPatterns(RewritePatternSet *patterns);
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.h b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.h
index 555d9bea18ba4dc..a9bc3351f4cff05 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.h
+++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.h
@@ -34,6 +34,11 @@ class PatternRewriter;
 
 namespace tosa {
 
+ParseResult parseTypeOrAttr(OpAsmParser &parser, TypeAttr &typeAttr,
+                            Attribute &attr);
+void printTypeOrAttr(OpAsmPrinter &p, Operation *op, TypeAttr type,
+                     Attribute attr);
+
 #include "mlir/Dialect/Tosa/IR/TosaInterfaces.h.inc"
 
 } // namespace tosa
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaUtilOps.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaUtilOps.td
index d75f5dffa8716c9..f9f25da1b649dea 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/TosaUtilOps.td
+++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaUtilOps.td
@@ -79,4 +79,71 @@ def Tosa_YieldOp : Tosa_Op<"yield", [
   let assemblyFormat = "$inputs attr-dict `:` type($inputs)";
 }
 
+//===----------------------------------------------------------------------===//
+// Operator: variable
+//===----------------------------------------------------------------------===//
+def Tosa_VariableOp : Tosa_Op<"variable", []> {
+  let summary = "Defines a variable";
+
+  let description = [{
+    Defines a new TOSA variable. This is a mutable value.
+    Modifications are expressed using read/write semantics.
+  }];
+
+  let arguments = (ins
+    SymbolNameAttr:$name,
+    TypeAttr:$type,
+    OptionalAttr<AnyAttr>:$initial_value
+  );
+
+  let assemblyFormat = [{
+    $name
+    attr-dict
+    custom<TypeOrAttr>($type, $initial_value)
+  }];
+}
+
+//===----------------------------------------------------------------------===//
+// Operator: variable.write
+//===----------------------------------------------------------------------===//
+def Tosa_VariableWriteOp : Tosa_Op<"variable.write", []> {
+  let summary = "write_buffer operator";
+
+  let description = [{
+    Assigns a value to pseudo-buffer resource holding a mutable tensor.
+  }];
+
+  let arguments = (ins
+    SymbolNameAttr:$name,
+    AnyType:$value
+  );
+
+  let assemblyFormat = [{
+    $name attr-dict `,` $value `:` type($value)
+  }];
+}
+
+//===----------------------------------------------------------------------===//
+// Operator: variable.read
+//===----------------------------------------------------------------------===//
+def Tosa_VariableReadOp : Tosa_Op<"variable.read", []> {
+  let summary = "read_buffer operator";
+
+  let description = [{
+    Reads the value from a pseudo-buffer resource holding a mutable tensor.
+  }];
+
+  let arguments = (ins
+    SymbolNameAttr:$name
+  );
+
+  let results = (outs
+    AnyType:$value
+  );
+
+  let assemblyFormat = [{
+    $name attr-dict `:` type($value)
+  }];
+}
+
 #endif // TOSA_UTIL_OPS
diff --git a/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.h b/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.h
index 940aed107e2f916..fbfc56dfe2cf4f1 100644
--- a/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.h
@@ -68,9 +68,6 @@ struct ValidationOptions {
   }
 };
 
-std::unique_ptr<Pass> createTosaValidationPass(
-    ValidationOptions const &options = ValidationOptions());
-
 #define GEN_PASS_REGISTRATION
 #include "mlir/Dialect/Tosa/Transforms/Passes.h.inc"
 
diff --git a/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.td b/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.td
index ac100a6d75c7c08..a0f670de20150fb 100644
--- a/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/Tosa/Transforms/Passes.td
@@ -89,13 +89,12 @@ def TosaLevelType : I32EnumAttr<"TosaLevelEnum", "Tosa level",
   let cppNamespace = "mlir::tosa";
 }
 
-def TosaValidation : Pass<"tosa-validate", "func::FuncOp"> {
+def TosaValidation : Pass<"tosa-validate", "mlir::ModuleOp"> {
   let summary = "Validates TOSA dialect";
   let description = [{
     This pass validates if input TOSA operations match the specification for given
     criteria, e.g. TOSA profile.
   }];
-  let constructor = "createTosaValidationPass()";
 
   let options = [
       Option<"profile", "profile", "mlir::tosa::TosaProfileEnum",
diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgPass.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgPass.cpp
index 718e34ced8d7e70..3c54f85b033b0b6 100644
--- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgPass.cpp
+++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgPass.cpp
@@ -76,7 +76,7 @@ std::unique_ptr<Pass> mlir::tosa::createTosaToLinalg() {
 
 void mlir::tosa::addTosaToLinalgPasses(
     OpPassManager &pm, const TosaToLinalgOptions &options,
-    tosa::ValidationOptions const &validationOptions) {
+    tosa::TosaValidationOptions const &validationOptions) {
   // Optional decompositions are designed to benefit linalg.
   if (!options.disableTosaDecompositions)
     pm.addNestedPass<func::FuncOp>(tosa::createTosaOptionalDecompositions());
@@ -90,7 +90,6 @@ void mlir::tosa::addTosaToLinalgPasses(
   pm.addNestedPass<func::FuncOp>(tosa::createTosaLayerwiseConstantFoldPass(
       {options.aggressiveReduceConstant}));
   pm.addNestedPass<func::FuncOp>(tosa::createTosaMakeBroadcastablePass());
-  pm.addNestedPass<func::FuncOp>(
-      tosa::createTosaValidationPass(validationOptions));
+  pm.addNestedPass<func::FuncOp>(tosa::createTosaValidation(validationOptions));
   pm.addNestedPass<func::FuncOp>(tosa::createTosaToLinalg());
 }
diff --git a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp
index 6db04fe38bcd356..ff34183f9a030a8 100644
--- a/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp
+++ b/mlir/lib/Dialect/Tosa/IR/TosaOps.cpp
@@ -146,6 +146,49 @@ Operation *TosaDialect::materializeConstant(OpBuilder &builder, Attribute value,
   return nullptr;
 }
 
+//===----------------------------------------------------------------------===//
+// Parsers and printers
+//===----------------------------------------------------------------------===//
+
+ParseResult mlir::tosa::parseTypeOrAttr(OpAsmParser &parser, TypeAttr &typeAttr,
+                                        Attribute &attr) {
+  if (succeeded(parser.parseOptionalEqual())) {
+    if (failed(parser.parseAttribute(attr))) {
+      return parser.emitError(parser.getCurrentLocation())
+             << "expected attribute";
+    }
+    if (auto typedAttr = attr.dyn_cast<TypedAttr>()) {
+      typeAttr = TypeAttr::get(typedAttr.getType());
+    }
+    return success();
+  }
+
+  Type type;
+  if (failed(parser.parseColonType(type))) {
+    return parser.emitError(parser.getCurrentLocation()) << "expected type";
+  }
+  typeAttr = TypeAttr::get(type);
+
+  return success();
+}
+
+void mlir::tosa::printTypeOrAttr(OpAsmPrinter &p, Operation *op, TypeAttr type,
+                                 Attribute attr) {
+  bool needsSpace = false;
+  auto typedAttr = attr.dyn_cast_or_null<TypedAttr>();
+  if (!typedAttr || typedAttr.getType() != type.getValue()) {
+    p << ": ";
+    p.printAttribute(type);
+    needsSpace = true; // subsequent attr value needs a space separator
+  }
+  if (attr) {
+    if (needsSpace)
+      p << ' ';
+    p << "= ";
+    p.printAttribute(attr);
+  }
+}
+
 //===----------------------------------------------------------------------===//
 // TOSA Operator Verifiers.
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp
index 52885e69c3924f2..d686ce125c13516 100644
--- a/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp
+++ b/mlir/lib/Dialect/Tosa/Transforms/TosaValidation.cpp
@@ -14,6 +14,9 @@
 #include "mlir/Dialect/Tosa/Transforms/Passes.h"
 #include "mlir/Dialect/Tosa/Transforms/PassesEnums.cpp.inc"
 
+#include <string>
+#include <unordered_map>
+
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/Tosa/IR/TosaOps.h"
 #include "mlir/IR/Builders.h"
@@ -96,12 +99,13 @@ static constexpr tosa_level_t TOSA_LEVEL_NONE = {0, 0, 0, 0};
 struct TosaValidation : public tosa::impl::TosaValidationBase<TosaValidation> {
 public:
   explicit TosaValidation() { populateConstantOperandChecks(); }
-  explicit TosaValidation(const ValidationOptions &options) : TosaValidation() {
+  explicit TosaValidation(const TosaValidationOptions &options)
+      : TosaValidation() {
     this->profile = options.profile;
-    this->StrictOperationSpecAlignment = options.strictOperationSpecAlignment;
+    this->StrictOperationSpecAlignment = options.StrictOperationSpecAlignment;
     this->level = options.level;
   }
-  void runOnOperation() override;
+  void runOnOperation() final;
 
   LogicalResult applyConstantOperandCheck(Operation *op) {
     for (auto &checker : const_checkers) {
@@ -113,6 +117,9 @@ struct TosaValidation : public tosa::impl::TosaValidationBase<TosaValidation> {
 
   LogicalResult applyLevelCheck(Operation *op);
 
+  // check variable read/write data types against variable declarations
+  LogicalResult applyVariableCheck(Operation *op);
+
 private:
   void populateConstantOperandChecks() {
     const_checkers.emplace_back(checkConstantOperandPad);
@@ -398,8 +405,12 @@ struct TosaValidation : public tosa::impl::TosaValidationBase<TosaValidation> {
     }
   }
 
+  bool CheckVariable(Operation *op);
+  bool CheckVariableReadOrWrite(Operation *op);
+
   SmallVector<std::function<LogicalResult(Operation *)>> const_checkers;
   tosa_level_t tosa_level;
+  DenseMap<const mlir::StringAttr *, mlir::Type> variables_map;
 };
 
 LogicalResult TosaValidation::applyLevelCheck(Operation *op) {
@@ -427,6 +438,69 @@ LogicalResult TosaValidation::applyLevelCheck(Operation *op) {
   return success();
 }
 
+inline bool CompatibleTypes(const mlir::Type &type,
+                            const mlir::Type &declared_type) {
+  // for now, simply use type equality comparison
+  return type == declared_type;
+}
+
+bool TosaValidation::CheckVariable(Operation *op) {
+  if (isa<mlir::tosa::VariableOp>(op)) {
+    auto name_attr = cast<mlir::StringAttr>(op->getAttr("name"));
+
+    if (variables_map.count(&name_attr)) {
+      op->emitOpError() << "name has already been declared";
+      return false;
+    }
+
+    auto type_attr = cast<mlir::TypeAttr>(op->getAttr("type"));
+    mlir::Type type = type_attr.getValue();
+
+    variables_map[&name_attr] = type;
+  }
+
+  return true;
+}
+
+bool TosaValidation::CheckVariableReadOrWrite(Operation *op) {
+  if (isa<mlir::tosa::VariableReadOp>(op) ||
+      isa<mlir::tosa::VariableWriteOp>(op)) {
+    auto name_attr = cast<mlir::StringAttr>(op->getAttr("name"));
+
+    if (!variables_map.count(&name_attr)) {
+      op->emitOpError() << "name has not been declared";
+      return false;
+    }
+
+    auto var_type = variables_map[&name_attr];
+
+    for (auto v : op->getOperands()) {
+      auto type = v.getType();
+      if (!CompatibleTypes(type, var_type)) {
+        op->emitOpError() << "operand type does not equal variable type";
+        return false;
+      }
+    }
+
+    for (auto v : op->getResults()) {
+      auto type = v.getType();
+      if (!CompatibleTypes(type, var_type)) {
+        op->emitOpError() << "result type does not equal variable type";
+        return false;
+      }
+    }
+  }
+
+  return true;
+}
+
+LogicalResult TosaValidation::applyVariableCheck(Operation *op) {
+  if (!CheckVariable(op) || !CheckVariableReadOrWrite(op)) {
+    return failure();
+  }
+  return success();
+}
+
 void TosaValidation::runOnOperation() {
   configLevelAndProfile();
   getOperation().walk([&](Operation *op) {
@@ -440,18 +514,18 @@ void TosaValidation::runOnOperation() {
       }
     }
 
-    // Some uses of TOSA rely on the constant operands of particular operations.
+    // Some uses of TOSA rely on the constant operands of particular
+    // operations.
     if (StrictOperationSpecAlignment && failed(applyConstantOperandCheck(op)))
       signalPassFailure();
 
     // do level checks
     if (failed(applyLevelCheck(op)))
       signalPassFailure();
+
+    // do variable type checks
+    if (failed(applyVariableCheck(op)))
+      signalPassFailure();
   });
 }
 } // namespace
-
-std::unique_ptr<Pass>
-mlir::tosa::createTosaValidationPass(ValidationOptions const &options) {
-  return std::make_unique<TosaValidation>(options);
-}
diff --git a/mlir/test/Dialect/Tosa/invalid.mlir b/mlir/test/Dialect/Tosa/invalid.mlir
index 7c58bb10b9c5ed6..9233662e88db902 100644
--- a/mlir/test/Dialect/Tosa/invalid.mlir
+++ b/mlir/test/Dialect/Tosa/invalid.mlir
@@ -203,3 +203,48 @@ func.func @test_avg_pool2d_zero_dim_input(%arg0: tensor<1x0x?x9xf32>) -> tensor<
       : (tensor<1x0x?x9xf32>) -> tensor<1x7x7x9xf32>
     return %0 : tensor<1x7x7x9xf32>
 }
+
+// -----
+
+func.func @test_variable_duplicates(%arg0: tensor<2x4x8xi32>) -> () {
+  tosa.variable @stored_var = dense<-1> : tensor<2x4x8xi32>
+  // expected-error at +1 {{'tosa.variable' op name has already been declared}}
+  tosa.variable @stored_var : tensor<1x4x8xi32>
+  return
+}
+
+// -----
+
+func.func @test_variable_read_type(%arg0: tensor<2x4x8xi32>) -> () {
+  tosa.variable @stored_var = dense<-1> : tensor<2x4x8xi32>
+  // expected-error at +1 {{'tosa.variable.read' op result type does not equal variable type}}
+  %0 = tosa.variable.read @stored_var : tensor<2x4x8xi16>
+  return
+}
+
+// -----
+
+func.func @test_variable_read_shape(%arg0: tensor<2x4x8xi32>) -> () {
+  tosa.variable @stored_var = dense<-1> : tensor<2x4x8xi32>
+  // expected-error at +1 {{'tosa.variable.read' op result type does not equal variable type}}
+  %0 = tosa.variable.read @stored_var : tensor<1x4x8xi32>
+  return
+}
+
+// -----
+
+func.func @test_variable_write_type(%arg0: tensor<2x4x8xi16>) -> () {
+  tosa.variable @stored_var = dense<-1> : tensor<2x4x8xi32>
+  // expected-error at +1 {{'tosa.variable.write' op operand type does not equal variable type}}
+  tosa.variable.write @stored_var, %arg0 : tensor<2x4x8xi16>
+  return
+}
+
+// -----
+
+func.func @test_variable_write_shape(%arg0: tensor<1x4x8xi32>) -> () {
+  tosa.variable @stored_var = dense<-1> : tensor<2x4x8xi32>
+  // expected-error at +1 {{'tosa.variable.write' op operand type does not equal variable type}}
+  tosa.variable.write @stored_var, %arg0 : tensor<1x4x8xi32>
+  return
+}
diff --git a/mlir/test/Dialect/Tosa/variables.mlir b/mlir/test/Dialect/Tosa/variables.mlir
new file mode 100644
index 000000000000000..9a26aa0bc8bf4d5
--- /dev/null
+++ b/mlir/test/Dialect/Tosa/variables.mlir
@@ -0,0 +1,33 @@
+// RUN: mlir-opt %s | mlir-opt | FileCheck %s
+// RUN: mlir-opt %s --mlir-print-op-generic | mlir-opt | FileCheck %s
+
+
+// -----
+// CHECK-LABEL:   @test_variable_scalar(
+// CHECK-SAME:                        %[[ADD_VAL:.*]]: tensor<f32>) {
+func.func @test_variable_scalar(%arg0: tensor<f32>) -> () {
+  // CHECK:           tosa.variable @stored_var = dense<3.140000e+00> : tensor<f32>
+  tosa.variable @stored_var = dense<3.14> : tensor<f32>
+  // CHECK:           %[[STORED_VAL:.*]] = tosa.variable.read @stored_var : tensor<f32>
+  %0 = tosa.variable.read @stored_var : tensor<f32>
+  // CHECK:           %[[RESULT_ADD:.*]] = tosa.add %[[ADD_VAL]], %[[STORED_VAL]] : (tensor<f32>, tensor<f32>) -> tensor<f32>
+  %1 = "tosa.add"(%arg0, %0) : (tensor<f32>, tensor<f32>) -> tensor<f32>
+  // CHECK:           tosa.variable.write @stored_var, %[[RESULT_ADD]] : tensor<f32>
+  tosa.variable.write @stored_var, %1 : tensor<f32>
+  return
+}
+
+// -----
+// CHECK-LABEL:   @test_variable_tensor(
+// CHECK-SAME:                        %[[ADD_VAL:.*]]: tensor<2x4x8xi32>) {
+func.func @test_variable_tensor(%arg0: tensor<2x4x8xi32>) -> () {
+  // CHECK:           tosa.variable @stored_var = dense<-1> : tensor<2x4x8xi32>
+  tosa.variable @stored_var = dense<-1> : tensor<2x4x8xi32>
+  // CHECK:           %[[STORED_VAL:.*]] = tosa.variable.read @stored_var : tensor<2x4x8xi32>
+  %0 = tosa.variable.read @stored_var : tensor<2x4x8xi32>
+  // CHECK:           %[[RESULT_ADD:.*]] = tosa.add %[[ADD_VAL]], %[[STORED_VAL]] : (tensor<2x4x8xi32>, tensor<2x4x8xi32>) -> tensor<2x4x8xi32>
+  %1 = "tosa.add"(%arg0, %0) : (tensor<2x4x8xi32>, tensor<2x4x8xi32>) -> tensor<2x4x8xi32>
+  // CHECK:           tosa.variable.write @stored_var, %[[RESULT_ADD]] : tensor<2x4x8xi32>
+  tosa.variable.write @stored_var, %1 : tensor<2x4x8xi32>
+  return
+}

>From a6ebbb27796ca8ccbae773ddfd55e94596b6595b Mon Sep 17 00:00:00 2001
From: Peter Klausler <35819229+klausler at users.noreply.github.com>
Date: Mon, 16 Oct 2023 16:10:44 -0700
Subject: [PATCH 05/14] [flang][runtime] Fix edge cases with ROUND=UP/DOWN
 (#67508)

When an unrepresentable nonzero real input value with a very small
exponent is currently being read in as zero, don't neglect
ROUND=UP/DOWN; return the least nonzero subnormal value instead when
appropriate.
---
 flang/lib/Decimal/binary-to-decimal.cpp |  3 ++-
 flang/lib/Decimal/decimal-to-binary.cpp | 30 +++++++++++++++++--------
 2 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/flang/lib/Decimal/binary-to-decimal.cpp b/flang/lib/Decimal/binary-to-decimal.cpp
index 7b31d02b292e48a..55fc548a6979bd3 100644
--- a/flang/lib/Decimal/binary-to-decimal.cpp
+++ b/flang/lib/Decimal/binary-to-decimal.cpp
@@ -373,7 +373,8 @@ STREAM &BigRadixFloatingPointNumber<PREC, LOG10RADIX>::Dump(STREAM &o) const {
   if (isNegative_) {
     o << '-';
   }
-  o << "10**(" << exponent_ << ") * ...\n";
+  o << "10**(" << exponent_ << ") * ...  (rounding "
+    << static_cast<int>(rounding_) << ")\n";
   for (int j{digits_}; --j >= 0;) {
     std::string str{std::to_string(digit_[j])};
     o << std::string(20 - str.size(), ' ') << str << " [" << j << ']';
diff --git a/flang/lib/Decimal/decimal-to-binary.cpp b/flang/lib/Decimal/decimal-to-binary.cpp
index c8c7b23329e00ce..d5b66b9fb933888 100644
--- a/flang/lib/Decimal/decimal-to-binary.cpp
+++ b/flang/lib/Decimal/decimal-to-binary.cpp
@@ -257,13 +257,20 @@ ConversionToBinaryResult<PREC> IntermediateFloat<PREC>::ToBinary(
     flags |= Inexact;
   }
   if (fraction == 0 && guard <= oneHalf) {
-    return {Binary{}, static_cast<enum ConversionResultFlags>(flags)};
-  }
-  // The value is nonzero; normalize it.
-  while (fraction < topBit && expo > 1) {
-    --expo;
-    fraction = fraction * 2 + (guard >> (guardBits - 2));
-    guard = (((guard >> (guardBits - 2)) & 1) << (guardBits - 1)) | (guard & 1);
+    if ((!isNegative && rounding == RoundUp) ||
+        (isNegative && rounding == RoundDown)) {
+      // round to minimum nonzero value
+    } else {
+      return {Binary{}, static_cast<enum ConversionResultFlags>(flags)};
+    }
+  } else {
+    // The value is nonzero; normalize it.
+    while (fraction < topBit && expo > 1) {
+      --expo;
+      fraction = fraction * 2 + (guard >> (guardBits - 2));
+      guard =
+          (((guard >> (guardBits - 2)) & 1) << (guardBits - 1)) | (guard & 1);
+    }
   }
   // Apply rounding
   bool incr{false};
@@ -330,8 +337,13 @@ BigRadixFloatingPointNumber<PREC, LOG10RADIX>::ConvertToBinary() {
   exponent_ += digits_ * log10Radix;
   // Sanity checks for ridiculous exponents
   static constexpr int crazy{2 * Real::decimalRange + log10Radix};
-  if (exponent_ < -crazy) { // underflow to +/-0.
-    return {Real{SignBit()}, Inexact};
+  if (exponent_ < -crazy) {
+    if ((!isNegative_ && rounding_ == RoundUp) ||
+        (isNegative_ && rounding_ == RoundDown)) {
+      return {Real{Raw{1} | SignBit()}}; // return least nonzero value
+    } else { // underflow to +/-0.
+      return {Real{SignBit()}, Inexact};
+    }
   } else if (exponent_ > crazy) { // overflow to +/-Inf.
     return {Real{Infinity()}, Overflow};
   }

>From 33ac6a9483b77c47a2a97a676ea37340d86ae35f Mon Sep 17 00:00:00 2001
From: Alexander Shaposhnikov
 <6532716+alexander-shaposhnikov at users.noreply.github.com>
Date: Mon, 16 Oct 2023 16:12:33 -0700
Subject: [PATCH 06/14] [compiler-rt] Implement __extendxftf2 and __trunctfxf2
 for x86_64 (#66918)

This patch implements __extendxftf2 (long double -> f128) and
__trunctfxf2 (f128 -> long double) on x86_64.
This is a preparation to unblock https://reviews.llvm.org/D53608,
We intentionally do not modify compiler-rt/lib/builtins/fp_lib.h in this
PR
(in particular, to limit the scope and avoid exposing other functions on
X86_64 in this PR).
Instead, TODOs were added to use fp_lib.h once it is available.

Test plan:
1. ninja check-compiler-rt (verified on X86_64 and on Aarch64)
In particular, new tests (extendxftf2_test.c and trunctfxf2_test.c) were
added.
2. compared the results of conversions with what other compilers (gcc)
produce.
---
 compiler-rt/lib/builtins/CMakeLists.txt       |   2 +
 compiler-rt/lib/builtins/extendxftf2.c        |  23 ++++
 compiler-rt/lib/builtins/fp_extend.h          |  92 +++++++++++--
 compiler-rt/lib/builtins/fp_extend_impl.inc   |  83 ++++++------
 compiler-rt/lib/builtins/fp_trunc.h           |  83 ++++++++++--
 compiler-rt/lib/builtins/fp_trunc_impl.inc    | 122 ++++++++++--------
 compiler-rt/lib/builtins/trunctfxf2.c         |  24 ++++
 compiler-rt/test/builtins/Unit/addtf3_test.c  |   2 +-
 compiler-rt/test/builtins/Unit/divtf3_test.c  |   2 +-
 .../test/builtins/Unit/extenddftf2_test.c     |   2 +-
 .../test/builtins/Unit/extendhftf2_test.c     |   2 +-
 .../test/builtins/Unit/extendsftf2_test.c     |   2 +-
 .../test/builtins/Unit/extendxftf2_test.c     |  74 +++++++++++
 .../test/builtins/Unit/floatditf_test.c       |   2 +-
 .../test/builtins/Unit/floatsitf_test.c       |   2 +-
 .../test/builtins/Unit/floatunditf_test.c     |   2 +-
 .../test/builtins/Unit/floatunsitf_test.c     |   2 +-
 compiler-rt/test/builtins/Unit/fp_test.h      |  93 +++++++++----
 compiler-rt/test/builtins/Unit/multf3_test.c  |   2 +-
 compiler-rt/test/builtins/Unit/subtf3_test.c  |   2 +-
 .../test/builtins/Unit/trunctfxf2_test.c      |  97 ++++++++++++++
 21 files changed, 564 insertions(+), 151 deletions(-)
 create mode 100644 compiler-rt/lib/builtins/extendxftf2.c
 create mode 100644 compiler-rt/lib/builtins/trunctfxf2.c
 create mode 100644 compiler-rt/test/builtins/Unit/extendxftf2_test.c
 create mode 100644 compiler-rt/test/builtins/Unit/trunctfxf2_test.c

diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt
index 753d08273ea5472..4f210a5c0fef90f 100644
--- a/compiler-rt/lib/builtins/CMakeLists.txt
+++ b/compiler-rt/lib/builtins/CMakeLists.txt
@@ -280,6 +280,7 @@ endif ()
 # long double is not 80 bits on Android or MSVC.
 set(x86_80_BIT_SOURCES
   divxc3.c
+  extendxftf2.c
   fixxfdi.c
   fixxfti.c
   fixunsxfdi.c
@@ -291,6 +292,7 @@ set(x86_80_BIT_SOURCES
   floatuntixf.c
   mulxc3.c
   powixf2.c
+  trunctfxf2.c
 )
 
 if (NOT MSVC)
diff --git a/compiler-rt/lib/builtins/extendxftf2.c b/compiler-rt/lib/builtins/extendxftf2.c
new file mode 100644
index 000000000000000..20911fe7cf2a000
--- /dev/null
+++ b/compiler-rt/lib/builtins/extendxftf2.c
@@ -0,0 +1,23 @@
+//===-- lib/extendxftf2.c - long double -> quad conversion --------*- C -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Assumption: long double is a IEEE 80 bit floating point type padded to 128
+// bits.
+
+// TODO: use fp_lib.h once QUAD_PRECISION is available on x86_64.
+#if __LDBL_MANT_DIG__ == 64 && defined(__x86_64__) &&                          \
+    (defined(__FLOAT128__) || defined(__SIZEOF_FLOAT128__))
+#define SRC_80
+#define DST_QUAD
+#include "fp_extend_impl.inc"
+
+COMPILER_RT_ABI __float128 __extendxftf2(long double a) {
+  return __extendXfYf2__(a);
+}
+
+#endif
diff --git a/compiler-rt/lib/builtins/fp_extend.h b/compiler-rt/lib/builtins/fp_extend.h
index eee4722bf90e69f..86b32be12d55fc3 100644
--- a/compiler-rt/lib/builtins/fp_extend.h
+++ b/compiler-rt/lib/builtins/fp_extend.h
@@ -20,15 +20,22 @@
 typedef float src_t;
 typedef uint32_t src_rep_t;
 #define SRC_REP_C UINT32_C
-static const int srcSigBits = 23;
+static const int srcBits = sizeof(src_t) * CHAR_BIT;
+static const int srcSigFracBits = 23;
+// -1 accounts for the sign bit.
+static const int srcExpBits = srcBits - srcSigFracBits - 1;
 #define src_rep_t_clz clzsi
 
 #elif defined SRC_DOUBLE
 typedef double src_t;
 typedef uint64_t src_rep_t;
 #define SRC_REP_C UINT64_C
-static const int srcSigBits = 52;
-static __inline int src_rep_t_clz(src_rep_t a) {
+static const int srcBits = sizeof(src_t) * CHAR_BIT;
+static const int srcSigFracBits = 52;
+// -1 accounts for the sign bit.
+static const int srcExpBits = srcBits - srcSigFracBits - 1;
+
+static inline int src_rep_t_clz_impl(src_rep_t a) {
 #if defined __LP64__
   return __builtin_clzl(a);
 #else
@@ -38,6 +45,18 @@ static __inline int src_rep_t_clz(src_rep_t a) {
     return 32 + clzsi(a & REP_C(0xffffffff));
 #endif
 }
+#define src_rep_t_clz src_rep_t_clz_impl
+
+#elif defined SRC_80
+typedef long double src_t;
+typedef __uint128_t src_rep_t;
+#define SRC_REP_C (__uint128_t)
+// sign bit, exponent and significand occupy the lower 80 bits.
+static const int srcBits = 80;
+static const int srcSigFracBits = 63;
+// -1 accounts for the sign bit.
+// -1 accounts for the explicitly stored integer bit.
+static const int srcExpBits = srcBits - srcSigFracBits - 1 - 1;
 
 #elif defined SRC_HALF
 #ifdef COMPILER_RT_HAS_FLOAT16
@@ -47,7 +66,11 @@ typedef uint16_t src_t;
 #endif
 typedef uint16_t src_rep_t;
 #define SRC_REP_C UINT16_C
-static const int srcSigBits = 10;
+static const int srcBits = sizeof(src_t) * CHAR_BIT;
+static const int srcSigFracBits = 10;
+// -1 accounts for the sign bit.
+static const int srcExpBits = srcBits - srcSigFracBits - 1;
+
 #define src_rep_t_clz __builtin_clz
 
 #else
@@ -58,28 +81,75 @@ static const int srcSigBits = 10;
 typedef float dst_t;
 typedef uint32_t dst_rep_t;
 #define DST_REP_C UINT32_C
-static const int dstSigBits = 23;
+static const int dstBits = sizeof(dst_t) * CHAR_BIT;
+static const int dstSigFracBits = 23;
+// -1 accounts for the sign bit.
+static const int dstExpBits = dstBits - dstSigFracBits - 1;
 
 #elif defined DST_DOUBLE
 typedef double dst_t;
 typedef uint64_t dst_rep_t;
 #define DST_REP_C UINT64_C
-static const int dstSigBits = 52;
+static const int dstBits = sizeof(dst_t) * CHAR_BIT;
+static const int dstSigFracBits = 52;
+// -1 accounts for the sign bit.
+static const int dstExpBits = dstBits - dstSigFracBits - 1;
 
 #elif defined DST_QUAD
+// TODO: use fp_lib.h once QUAD_PRECISION is available on x86_64.
+#if __LDBL_MANT_DIG__ == 113
 typedef long double dst_t;
+#elif defined(__x86_64__) &&                                                   \
+    (defined(__FLOAT128__) || defined(__SIZEOF_FLOAT128__))
+typedef __float128 dst_t;
+#endif
 typedef __uint128_t dst_rep_t;
 #define DST_REP_C (__uint128_t)
-static const int dstSigBits = 112;
+static const int dstBits = sizeof(dst_t) * CHAR_BIT;
+static const int dstSigFracBits = 112;
+// -1 accounts for the sign bit.
+static const int dstExpBits = dstBits - dstSigFracBits - 1;
 
 #else
 #error Destination should be single, double, or quad precision!
 #endif // end destination precision
 
-// End of specialization parameters.  Two helper routines for conversion to and
-// from the representation of floating-point data as integer values follow.
+// End of specialization parameters.
+
+// TODO: These helper routines should be placed into fp_lib.h
+// Currently they depend on macros/constants defined above.
+
+static inline src_rep_t extract_sign_from_src(src_rep_t x) {
+  const src_rep_t srcSignMask = SRC_REP_C(1) << (srcBits - 1);
+  return (x & srcSignMask) >> (srcBits - 1);
+}
+
+static inline src_rep_t extract_exp_from_src(src_rep_t x) {
+  const int srcSigBits = srcBits - 1 - srcExpBits;
+  const src_rep_t srcExpMask = ((SRC_REP_C(1) << srcExpBits) - 1) << srcSigBits;
+  return (x & srcExpMask) >> srcSigBits;
+}
+
+static inline src_rep_t extract_sig_frac_from_src(src_rep_t x) {
+  const src_rep_t srcSigFracMask = (SRC_REP_C(1) << srcSigFracBits) - 1;
+  return x & srcSigFracMask;
+}
+
+#ifdef src_rep_t_clz
+static inline int clz_in_sig_frac(src_rep_t sigFrac) {
+      const int skip = (sizeof(dst_t) * CHAR_BIT - srcBits) + 1 + srcExpBits;
+      return src_rep_t_clz(sigFrac) - skip;
+}
+#endif
+
+static inline dst_rep_t construct_dst_rep(dst_rep_t sign, dst_rep_t exp, dst_rep_t sigFrac) {
+  return (sign << (dstBits - 1)) | (exp << (dstBits - 1 - dstExpBits)) | sigFrac;
+}
+
+// Two helper routines for conversion to and from the representation of
+// floating-point data as integer values follow.
 
-static __inline src_rep_t srcToRep(src_t x) {
+static inline src_rep_t srcToRep(src_t x) {
   const union {
     src_t f;
     src_rep_t i;
@@ -87,7 +157,7 @@ static __inline src_rep_t srcToRep(src_t x) {
   return rep.i;
 }
 
-static __inline dst_t dstFromRep(dst_rep_t x) {
+static inline dst_t dstFromRep(dst_rep_t x) {
   const union {
     dst_t f;
     dst_rep_t i;
diff --git a/compiler-rt/lib/builtins/fp_extend_impl.inc b/compiler-rt/lib/builtins/fp_extend_impl.inc
index d1c9c02a00c5314..e16b55d150d2eff 100644
--- a/compiler-rt/lib/builtins/fp_extend_impl.inc
+++ b/compiler-rt/lib/builtins/fp_extend_impl.inc
@@ -37,71 +37,72 @@
 
 #include "fp_extend.h"
 
+// The source type may use a usual IEEE-754 interchange format or Intel 80-bit
+// format. In particular, for the source type srcSigFracBits may be not equal to
+// srcSigBits. The destination type is assumed to be one of IEEE-754 standard
+// types.
 static __inline dst_t __extendXfYf2__(src_t a) {
   // Various constants whose values follow from the type parameters.
   // Any reasonable optimizer will fold and propagate all of these.
-  const int srcBits = sizeof(src_t) * CHAR_BIT;
-  const int srcExpBits = srcBits - srcSigBits - 1;
   const int srcInfExp = (1 << srcExpBits) - 1;
   const int srcExpBias = srcInfExp >> 1;
 
-  const src_rep_t srcMinNormal = SRC_REP_C(1) << srcSigBits;
-  const src_rep_t srcInfinity = (src_rep_t)srcInfExp << srcSigBits;
-  const src_rep_t srcSignMask = SRC_REP_C(1) << (srcSigBits + srcExpBits);
-  const src_rep_t srcAbsMask = srcSignMask - 1;
-  const src_rep_t srcQNaN = SRC_REP_C(1) << (srcSigBits - 1);
-  const src_rep_t srcNaNCode = srcQNaN - 1;
-
-  const int dstBits = sizeof(dst_t) * CHAR_BIT;
-  const int dstExpBits = dstBits - dstSigBits - 1;
   const int dstInfExp = (1 << dstExpBits) - 1;
   const int dstExpBias = dstInfExp >> 1;
 
-  const dst_rep_t dstMinNormal = DST_REP_C(1) << dstSigBits;
-
   // Break a into a sign and representation of the absolute value.
   const src_rep_t aRep = srcToRep(a);
-  const src_rep_t aAbs = aRep & srcAbsMask;
-  const src_rep_t sign = aRep & srcSignMask;
-  dst_rep_t absResult;
+  const src_rep_t srcSign = extract_sign_from_src(aRep);
+  const src_rep_t srcExp = extract_exp_from_src(aRep);
+  const src_rep_t srcSigFrac = extract_sig_frac_from_src(aRep);
+
+  dst_rep_t dstSign = srcSign;
+  dst_rep_t dstExp;
+  dst_rep_t dstSigFrac;
 
-  // If sizeof(src_rep_t) < sizeof(int), the subtraction result is promoted
-  // to (signed) int.  To avoid that, explicitly cast to src_rep_t.
-  if ((src_rep_t)(aAbs - srcMinNormal) < srcInfinity - srcMinNormal) {
+  if (srcExp >= 1 && srcExp < srcInfExp) {
     // a is a normal number.
-    // Extend to the destination type by shifting the significand and
-    // exponent into the proper position and rebiasing the exponent.
-    absResult = (dst_rep_t)aAbs << (dstSigBits - srcSigBits);
-    absResult += (dst_rep_t)(dstExpBias - srcExpBias) << dstSigBits;
+    dstExp = (dst_rep_t)srcExp + (dst_rep_t)(dstExpBias - srcExpBias);
+    dstSigFrac = (dst_rep_t)srcSigFrac << (dstSigFracBits - srcSigFracBits);
   }
 
-  else if (aAbs >= srcInfinity) {
+  else if (srcExp == srcInfExp) {
     // a is NaN or infinity.
-    // Conjure the result by beginning with infinity, then setting the qNaN
-    // bit (if needed) and right-aligning the rest of the trailing NaN
-    // payload field.
-    absResult = (dst_rep_t)dstInfExp << dstSigBits;
-    absResult |= (dst_rep_t)(aAbs & srcQNaN) << (dstSigBits - srcSigBits);
-    absResult |= (dst_rep_t)(aAbs & srcNaNCode) << (dstSigBits - srcSigBits);
+    dstExp = dstInfExp;
+    dstSigFrac = (dst_rep_t)srcSigFrac << (dstSigFracBits - srcSigFracBits);
   }
 
-  else if (aAbs) {
+  else if (srcSigFrac) {
     // a is denormal.
-    // renormalize the significand and clear the leading bit, then insert
-    // the correct adjusted exponent in the destination type.
-    const int scale = src_rep_t_clz(aAbs) - src_rep_t_clz(srcMinNormal);
-    absResult = (dst_rep_t)aAbs << (dstSigBits - srcSigBits + scale);
-    absResult ^= dstMinNormal;
-    const int resultExponent = dstExpBias - srcExpBias - scale + 1;
-    absResult |= (dst_rep_t)resultExponent << dstSigBits;
+    if (srcExpBits == dstExpBits) {
+      // The exponent fields are identical and this is a denormal number, so all
+      // the non-significand bits are zero. In particular, this branch is always
+      // taken when we extend a denormal F80 to F128.
+      dstExp = 0;
+      dstSigFrac = ((dst_rep_t)srcSigFrac) << (dstSigFracBits - srcSigFracBits);
+    } else {
+#ifndef src_rep_t_clz
+      // If src_rep_t_clz is not defined this branch must be unreachable.
+      __builtin_unreachable();
+#else
+      // Renormalize the significand and clear the leading bit.
+      // For F80 -> F128 this codepath is unused.
+      const int scale = clz_in_sig_frac(srcSigFrac) + 1;
+      dstExp = dstExpBias - srcExpBias - scale + 1;
+      dstSigFrac = (dst_rep_t)srcSigFrac
+                   << (dstSigFracBits - srcSigFracBits + scale);
+      const dst_rep_t dstMinNormal = DST_REP_C(1) << (dstBits - 1 - dstExpBits);
+      dstSigFrac ^= dstMinNormal;
+#endif
+    }
   }
 
   else {
     // a is zero.
-    absResult = 0;
+    dstExp = 0;
+    dstSigFrac = 0;
   }
 
-  // Apply the signbit to the absolute value.
-  const dst_rep_t result = absResult | (dst_rep_t)sign << (dstBits - srcBits);
+  const dst_rep_t result = construct_dst_rep(dstSign, dstExp, dstSigFrac);
   return dstFromRep(result);
 }
diff --git a/compiler-rt/lib/builtins/fp_trunc.h b/compiler-rt/lib/builtins/fp_trunc.h
index 91f614528ab3f42..ea13dc2efae5411 100644
--- a/compiler-rt/lib/builtins/fp_trunc.h
+++ b/compiler-rt/lib/builtins/fp_trunc.h
@@ -19,19 +19,34 @@
 typedef float src_t;
 typedef uint32_t src_rep_t;
 #define SRC_REP_C UINT32_C
-static const int srcSigBits = 23;
+static const int srcBits = sizeof(src_t) * CHAR_BIT;
+static const int srcSigFracBits = 23;
+// -1 accounts for the sign bit.
+static const int srcExpBits = srcBits - srcSigFracBits - 1;
 
 #elif defined SRC_DOUBLE
 typedef double src_t;
 typedef uint64_t src_rep_t;
 #define SRC_REP_C UINT64_C
-static const int srcSigBits = 52;
+static const int srcBits = sizeof(src_t) * CHAR_BIT;
+static const int srcSigFracBits = 52;
+// -1 accounts for the sign bit.
+static const int srcExpBits = srcBits - srcSigFracBits - 1;
 
 #elif defined SRC_QUAD
+// TODO: use fp_lib.h once QUAD_PRECISION is available on x86_64.
+#if __LDBL_MANT_DIG__ == 113
 typedef long double src_t;
+#elif defined(__x86_64__) &&                                                   \
+    (defined(__FLOAT128__) || defined(__SIZEOF_FLOAT128__))
+typedef __float128 src_t;
+#endif
 typedef __uint128_t src_rep_t;
 #define SRC_REP_C (__uint128_t)
-static const int srcSigBits = 112;
+static const int srcBits = sizeof(src_t) * CHAR_BIT;
+static const int srcSigFracBits = 112;
+// -1 accounts for the sign bit.
+static const int srcExpBits = srcBits - srcSigFracBits - 1;
 
 #else
 #error Source should be double precision or quad precision!
@@ -41,13 +56,29 @@ static const int srcSigBits = 112;
 typedef double dst_t;
 typedef uint64_t dst_rep_t;
 #define DST_REP_C UINT64_C
-static const int dstSigBits = 52;
+static const int dstBits = sizeof(dst_t) * CHAR_BIT;
+static const int dstSigFracBits = 52;
+// -1 accounts for the sign bit.
+static const int dstExpBits = dstBits - dstSigFracBits - 1;
+
+#elif defined DST_80
+typedef long double dst_t;
+typedef __uint128_t dst_rep_t;
+#define DST_REP_C (__uint128_t)
+static const int dstBits = 80;
+static const int dstSigFracBits = 63;
+// -1 accounts for the sign bit.
+// -1 accounts for the explicitly stored integer bit.
+static const int dstExpBits = dstBits - dstSigFracBits - 1 - 1;
 
 #elif defined DST_SINGLE
 typedef float dst_t;
 typedef uint32_t dst_rep_t;
 #define DST_REP_C UINT32_C
-static const int dstSigBits = 23;
+static const int dstBits = sizeof(dst_t) * CHAR_BIT;
+static const int dstSigFracBits = 23;
+// -1 accounts for the sign bit.
+static const int dstExpBits = dstBits - dstSigFracBits - 1;
 
 #elif defined DST_HALF
 #ifdef COMPILER_RT_HAS_FLOAT16
@@ -57,22 +88,56 @@ typedef uint16_t dst_t;
 #endif
 typedef uint16_t dst_rep_t;
 #define DST_REP_C UINT16_C
-static const int dstSigBits = 10;
+static const int dstBits = sizeof(dst_t) * CHAR_BIT;
+static const int dstSigFracBits = 10;
+// -1 accounts for the sign bit.
+static const int dstExpBits = dstBits - dstSigFracBits - 1;
 
 #elif defined DST_BFLOAT
 typedef __bf16 dst_t;
 typedef uint16_t dst_rep_t;
 #define DST_REP_C UINT16_C
-static const int dstSigBits = 7;
+static const int dstBits = sizeof(dst_t) * CHAR_BIT;
+static const int dstSigFracBits = 7;
+// -1 accounts for the sign bit.
+static const int dstExpBits = dstBits - dstSigFracBits - 1;
 
 #else
 #error Destination should be single precision or double precision!
 #endif // end destination precision
 
+// TODO: These helper routines should be placed into fp_lib.h
+// Currently they depend on macros/constants defined above.
+
+static inline src_rep_t extract_sign_from_src(src_rep_t x) {
+  const src_rep_t srcSignMask = SRC_REP_C(1) << (srcBits - 1);
+  return (x & srcSignMask) >> (srcBits - 1);
+}
+
+static inline src_rep_t extract_exp_from_src(src_rep_t x) {
+  const int srcSigBits = srcBits - 1 - srcExpBits;
+  const src_rep_t srcExpMask = ((SRC_REP_C(1) << srcExpBits) - 1) << srcSigBits;
+  return (x & srcExpMask) >> srcSigBits;
+}
+
+static inline src_rep_t extract_sig_frac_from_src(src_rep_t x) {
+  const src_rep_t srcSigFracMask = (SRC_REP_C(1) << srcSigFracBits) - 1;
+  return x & srcSigFracMask;
+}
+
+static inline dst_rep_t construct_dst_rep(dst_rep_t sign, dst_rep_t exp, dst_rep_t sigFrac) {
+  dst_rep_t result = (sign << (dstBits - 1)) | (exp << (dstBits - 1 - dstExpBits)) | sigFrac;
+  // Set the explicit integer bit in F80 if present.
+  if (dstBits == 80 && exp) {
+    result |= (DST_REP_C(1) << dstSigFracBits);
+  }
+  return result;
+}
+
 // End of specialization parameters.  Two helper routines for conversion to and
 // from the representation of floating-point data as integer values follow.
 
-static __inline src_rep_t srcToRep(src_t x) {
+static inline src_rep_t srcToRep(src_t x) {
   const union {
     src_t f;
     src_rep_t i;
@@ -80,7 +145,7 @@ static __inline src_rep_t srcToRep(src_t x) {
   return rep.i;
 }
 
-static __inline dst_t dstFromRep(dst_rep_t x) {
+static inline dst_t dstFromRep(dst_rep_t x) {
   const union {
     dst_t f;
     dst_rep_t i;
diff --git a/compiler-rt/lib/builtins/fp_trunc_impl.inc b/compiler-rt/lib/builtins/fp_trunc_impl.inc
index e235f45965a7276..f68492495697f9a 100644
--- a/compiler-rt/lib/builtins/fp_trunc_impl.inc
+++ b/compiler-rt/lib/builtins/fp_trunc_impl.inc
@@ -38,102 +38,118 @@
 
 #include "fp_trunc.h"
 
+// The destination type may use a usual IEEE-754 interchange format or Intel
+// 80-bit format. In particular, for the destination type dstSigFracBits may be
+// not equal to dstSigBits. The source type is assumed to be one of IEEE-754
+// standard types.
 static __inline dst_t __truncXfYf2__(src_t a) {
   // Various constants whose values follow from the type parameters.
   // Any reasonable optimizer will fold and propagate all of these.
-  const int srcBits = sizeof(src_t) * CHAR_BIT;
-  const int srcExpBits = srcBits - srcSigBits - 1;
   const int srcInfExp = (1 << srcExpBits) - 1;
   const int srcExpBias = srcInfExp >> 1;
 
-  const src_rep_t srcMinNormal = SRC_REP_C(1) << srcSigBits;
-  const src_rep_t srcSignificandMask = srcMinNormal - 1;
-  const src_rep_t srcInfinity = (src_rep_t)srcInfExp << srcSigBits;
-  const src_rep_t srcSignMask = SRC_REP_C(1) << (srcSigBits + srcExpBits);
-  const src_rep_t srcAbsMask = srcSignMask - 1;
-  const src_rep_t roundMask = (SRC_REP_C(1) << (srcSigBits - dstSigBits)) - 1;
-  const src_rep_t halfway = SRC_REP_C(1) << (srcSigBits - dstSigBits - 1);
-  const src_rep_t srcQNaN = SRC_REP_C(1) << (srcSigBits - 1);
+  const src_rep_t srcMinNormal = SRC_REP_C(1) << srcSigFracBits;
+  const src_rep_t roundMask =
+      (SRC_REP_C(1) << (srcSigFracBits - dstSigFracBits)) - 1;
+  const src_rep_t halfway = SRC_REP_C(1)
+                            << (srcSigFracBits - dstSigFracBits - 1);
+  const src_rep_t srcQNaN = SRC_REP_C(1) << (srcSigFracBits - 1);
   const src_rep_t srcNaNCode = srcQNaN - 1;
 
-  const int dstBits = sizeof(dst_t) * CHAR_BIT;
-  const int dstExpBits = dstBits - dstSigBits - 1;
   const int dstInfExp = (1 << dstExpBits) - 1;
   const int dstExpBias = dstInfExp >> 1;
-
-  const int underflowExponent = srcExpBias + 1 - dstExpBias;
   const int overflowExponent = srcExpBias + dstInfExp - dstExpBias;
-  const src_rep_t underflow = (src_rep_t)underflowExponent << srcSigBits;
-  const src_rep_t overflow = (src_rep_t)overflowExponent << srcSigBits;
 
-  const dst_rep_t dstQNaN = DST_REP_C(1) << (dstSigBits - 1);
+  const dst_rep_t dstQNaN = DST_REP_C(1) << (dstSigFracBits - 1);
   const dst_rep_t dstNaNCode = dstQNaN - 1;
 
-  // Break a into a sign and representation of the absolute value.
   const src_rep_t aRep = srcToRep(a);
-  const src_rep_t aAbs = aRep & srcAbsMask;
-  const src_rep_t sign = aRep & srcSignMask;
-  dst_rep_t absResult;
+  const src_rep_t srcSign = extract_sign_from_src(aRep);
+  const src_rep_t srcExp = extract_exp_from_src(aRep);
+  const src_rep_t srcSigFrac = extract_sig_frac_from_src(aRep);
+
+  dst_rep_t dstSign = srcSign;
+  dst_rep_t dstExp;
+  dst_rep_t dstSigFrac;
 
-  const int tailBits = srcBits - dstBits;
-  if (srcExpBits == dstExpBits && ((aRep >> tailBits) << tailBits) == aRep) {
-    // Same size exponents and a's significand tail is 0. Remove tail.
-    dst_rep_t result = aRep >> tailBits;
-    return dstFromRep(result);
+  // Same size exponents and a's significand tail is 0.
+  // The significand can be truncated and the exponent can be copied over.
+  const int sigFracTailBits = srcSigFracBits - dstSigFracBits;
+  if (srcExpBits == dstExpBits &&
+      ((aRep >> sigFracTailBits) << sigFracTailBits) == aRep) {
+    dstExp = srcExp;
+    dstSigFrac = (dst_rep_t)(srcSigFrac >> sigFracTailBits);
+    return dstFromRep(construct_dst_rep(dstSign, dstExp, dstSigFrac));
   }
 
-  if (aAbs - underflow < aAbs - overflow) {
+  const int dstExpCandidate = ((int)srcExp - srcExpBias) + dstExpBias;
+  if (dstExpCandidate >= 1 && dstExpCandidate < dstInfExp) {
     // The exponent of a is within the range of normal numbers in the
-    // destination format.  We can convert by simply right-shifting with
+    // destination format. We can convert by simply right-shifting with
     // rounding and adjusting the exponent.
-    absResult = aAbs >> (srcSigBits - dstSigBits);
-    absResult -= (dst_rep_t)(srcExpBias - dstExpBias) << dstSigBits;
+    dstExp = dstExpCandidate;
+    dstSigFrac = (dst_rep_t)(srcSigFrac >> sigFracTailBits);
 
-    const src_rep_t roundBits = aAbs & roundMask;
+    const src_rep_t roundBits = srcSigFrac & roundMask;
     // Round to nearest.
     if (roundBits > halfway)
-      absResult++;
+      dstSigFrac++;
     // Tie to even.
     else if (roundBits == halfway)
-      absResult += absResult & 1;
-  } else if (aAbs > srcInfinity) {
+      dstSigFrac += dstSigFrac & 1;
+
+    // Rounding has changed the exponent.
+    if (dstSigFrac >= (DST_REP_C(1) << dstSigFracBits)) {
+      dstExp += 1;
+      dstSigFrac ^= (DST_REP_C(1) << dstSigFracBits);
+    }
+  } else if (srcExp == srcInfExp && srcSigFrac) {
     // a is NaN.
     // Conjure the result by beginning with infinity, setting the qNaN
     // bit and inserting the (truncated) trailing NaN field.
-    absResult = (dst_rep_t)dstInfExp << dstSigBits;
-    absResult |= dstQNaN;
-    absResult |=
-        ((aAbs & srcNaNCode) >> (srcSigBits - dstSigBits)) & dstNaNCode;
-  } else if (aAbs >= overflow) {
-    // a overflows to infinity.
-    absResult = (dst_rep_t)dstInfExp << dstSigBits;
+    dstExp = dstInfExp;
+    dstSigFrac = dstQNaN;
+    dstSigFrac |= ((srcSigFrac & srcNaNCode) >> sigFracTailBits) & dstNaNCode;
+  } else if ((int)srcExp >= overflowExponent) {
+    dstExp = dstInfExp;
+    dstSigFrac = 0;
   } else {
     // a underflows on conversion to the destination type or is an exact
     // zero.  The result may be a denormal or zero.  Extract the exponent
     // to get the shift amount for the denormalization.
-    const int aExp = aAbs >> srcSigBits;
-    const int shift = srcExpBias - dstExpBias - aExp + 1;
+    src_rep_t significand = srcSigFrac;
+    int shift = srcExpBias - dstExpBias - srcExp;
 
-    const src_rep_t significand = (aRep & srcSignificandMask) | srcMinNormal;
+    if (srcExp) {
+      // Set the implicit integer bit if the source is a normal number.
+      significand |= srcMinNormal;
+      shift += 1;
+    }
 
     // Right shift by the denormalization amount with sticky.
-    if (shift > srcSigBits) {
-      absResult = 0;
+    if (shift > srcSigFracBits) {
+      dstExp = 0;
+      dstSigFrac = 0;
     } else {
-      const bool sticky = (significand << (srcBits - shift)) != 0;
+      dstExp = 0;
+      const bool sticky = shift && ((significand << (srcBits - shift)) != 0);
       src_rep_t denormalizedSignificand = significand >> shift | sticky;
-      absResult = denormalizedSignificand >> (srcSigBits - dstSigBits);
+      dstSigFrac = denormalizedSignificand >> sigFracTailBits;
       const src_rep_t roundBits = denormalizedSignificand & roundMask;
       // Round to nearest
       if (roundBits > halfway)
-        absResult++;
+        dstSigFrac++;
       // Ties to even
       else if (roundBits == halfway)
-        absResult += absResult & 1;
+        dstSigFrac += dstSigFrac & 1;
+
+      // Rounding has changed the exponent.
+      if (dstSigFrac >= (DST_REP_C(1) << dstSigFracBits)) {
+        dstExp += 1;
+        dstSigFrac ^= (DST_REP_C(1) << dstSigFracBits);
+      }
     }
   }
 
-  // Apply the signbit to the absolute value.
-  const dst_rep_t result = absResult | sign >> (srcBits - dstBits);
-  return dstFromRep(result);
+  return dstFromRep(construct_dst_rep(dstSign, dstExp, dstSigFrac));
 }
diff --git a/compiler-rt/lib/builtins/trunctfxf2.c b/compiler-rt/lib/builtins/trunctfxf2.c
new file mode 100644
index 000000000000000..4a22a602b38173f
--- /dev/null
+++ b/compiler-rt/lib/builtins/trunctfxf2.c
@@ -0,0 +1,24 @@
+//===-- lib/trunctfsf2.c - long double -> quad conversion ---------*- C -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Assumption: long double is a IEEE 80 bit floating point type padded to 128
+// bits.
+
+// TODO: use fp_lib.h once QUAD_PRECISION is available on x86_64.
+#if __LDBL_MANT_DIG__ == 64 && defined(__x86_64__) &&                          \
+    (defined(__FLOAT128__) || defined(__SIZEOF_FLOAT128__))
+
+#define SRC_QUAD
+#define DST_80
+#include "fp_trunc_impl.inc"
+
+COMPILER_RT_ABI long double __trunctfxf2(__float128 a) {
+  return __truncXfYf2__(a);
+}
+
+#endif
diff --git a/compiler-rt/test/builtins/Unit/addtf3_test.c b/compiler-rt/test/builtins/Unit/addtf3_test.c
index fe2e2c80f655b7a..e6986c236a64f5e 100644
--- a/compiler-rt/test/builtins/Unit/addtf3_test.c
+++ b/compiler-rt/test/builtins/Unit/addtf3_test.c
@@ -16,7 +16,7 @@ int test__addtf3(long double a, long double b,
                  uint64_t expectedHi, uint64_t expectedLo)
 {
     long double x = __addtf3(a, b);
-    int ret = compareResultLD(x, expectedHi, expectedLo);
+    int ret = compareResultF128(x, expectedHi, expectedLo);
 
     if (ret){
         printf("error in test__addtf3(%.20Lf, %.20Lf) = %.20Lf, "
diff --git a/compiler-rt/test/builtins/Unit/divtf3_test.c b/compiler-rt/test/builtins/Unit/divtf3_test.c
index 927d0b826f8f57c..da6465636e92326 100644
--- a/compiler-rt/test/builtins/Unit/divtf3_test.c
+++ b/compiler-rt/test/builtins/Unit/divtf3_test.c
@@ -15,7 +15,7 @@ int test__divtf3(long double a, long double b,
                  uint64_t expectedHi, uint64_t expectedLo)
 {
     long double x = __divtf3(a, b);
-    int ret = compareResultLD(x, expectedHi, expectedLo);
+    int ret = compareResultF128(x, expectedHi, expectedLo);
 
     if (ret){
         printf("error in test__divtf3(%.20Le, %.20Le) = %.20Le, "
diff --git a/compiler-rt/test/builtins/Unit/extenddftf2_test.c b/compiler-rt/test/builtins/Unit/extenddftf2_test.c
index 04a346887661bf8..fcc030ca92202e9 100644
--- a/compiler-rt/test/builtins/Unit/extenddftf2_test.c
+++ b/compiler-rt/test/builtins/Unit/extenddftf2_test.c
@@ -13,7 +13,7 @@ COMPILER_RT_ABI long double __extenddftf2(double a);
 int test__extenddftf2(double a, uint64_t expectedHi, uint64_t expectedLo)
 {
     long double x = __extenddftf2(a);
-    int ret = compareResultLD(x, expectedHi, expectedLo);
+    int ret = compareResultF128(x, expectedHi, expectedLo);
 
     if (ret){
         printf("error in test__extenddftf2(%f) = %.20Lf, "
diff --git a/compiler-rt/test/builtins/Unit/extendhftf2_test.c b/compiler-rt/test/builtins/Unit/extendhftf2_test.c
index 7d3ea3049e8a195..5de17379093af18 100644
--- a/compiler-rt/test/builtins/Unit/extendhftf2_test.c
+++ b/compiler-rt/test/builtins/Unit/extendhftf2_test.c
@@ -12,7 +12,7 @@ COMPILER_RT_ABI long double __extendhftf2(TYPE_FP16 a);
 
 int test__extendhftf2(TYPE_FP16 a, uint64_t expectedHi, uint64_t expectedLo) {
   long double x = __extendhftf2(a);
-  int ret = compareResultLD(x, expectedHi, expectedLo);
+  int ret = compareResultF128(x, expectedHi, expectedLo);
 
   if (ret) {
     printf("error in test__extendhftf2(%#.4x) = %.20Lf, "
diff --git a/compiler-rt/test/builtins/Unit/extendsftf2_test.c b/compiler-rt/test/builtins/Unit/extendsftf2_test.c
index 19dd5b02c07bd26..6ce9bd81a3dd919 100644
--- a/compiler-rt/test/builtins/Unit/extendsftf2_test.c
+++ b/compiler-rt/test/builtins/Unit/extendsftf2_test.c
@@ -13,7 +13,7 @@ COMPILER_RT_ABI long double __extendsftf2(float a);
 int test__extendsftf2(float a, uint64_t expectedHi, uint64_t expectedLo)
 {
     long double x = __extendsftf2(a);
-    int ret = compareResultLD(x, expectedHi, expectedLo);
+    int ret = compareResultF128(x, expectedHi, expectedLo);
 
     if (ret)
     {
diff --git a/compiler-rt/test/builtins/Unit/extendxftf2_test.c b/compiler-rt/test/builtins/Unit/extendxftf2_test.c
new file mode 100644
index 000000000000000..f5211875438c732
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/extendxftf2_test.c
@@ -0,0 +1,74 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_extendxftf2
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#if __LDBL_MANT_DIG__ == 64 && defined(__x86_64__) &&                          \
+    (defined(__FLOAT128__) || defined(__SIZEOF_FLOAT128__))
+
+#include "fp_test.h"
+
+COMPILER_RT_ABI __float128 __extendxftf2(long double a);
+
+int test__extendxftf2(long double a, uint64_t expectedHi, uint64_t expectedLo) {
+  __float128 x = __extendxftf2(a);
+  int ret = compareResultF128(x, expectedHi, expectedLo);
+
+  if (ret) {
+    printf("error in __extendxftf2(%.20Lf) = %.20Lf, "
+           "expected %.20Lf\n",
+           a, x, fromRep128(expectedHi, expectedLo));
+  }
+  return ret;
+}
+
+char assumption_1[sizeof(long double) * CHAR_BIT == 128] = {0};
+
+#endif
+
+int main() {
+#if __LDBL_MANT_DIG__ == 64 && defined(__x86_64__) &&                          \
+    (defined(__FLOAT128__) || defined(__SIZEOF_FLOAT128__))
+  // qNaN
+  if (test__extendxftf2(makeQNaN80(), UINT64_C(0x7fff800000000000),
+                        UINT64_C(0x0)))
+    return 1;
+  // NaN
+  if (test__extendxftf2(makeNaN80(UINT64_C(0x3fffffffffffffff)),
+                        UINT64_C(0x7fff7fffffffffff),
+                        UINT64_C(0xfffe000000000000)))
+    return 1;
+  // inf
+  if (test__extendxftf2(makeInf80(), UINT64_C(0x7fff000000000000),
+                        UINT64_C(0x0)))
+    return 1;
+  // zero
+  if (test__extendxftf2(0.0, UINT64_C(0x0), UINT64_C(0x0)))
+    return 1;
+  if (test__extendxftf2(0x1.23456789abcdefp+5, UINT64_C(0x400423456789abcd),
+                        UINT64_C(0xf000000000000000)))
+    return 1;
+  if (test__extendxftf2(0x1.edcba987654321fp-9, UINT64_C(0x3ff6edcba9876543),
+                        UINT64_C(0x2000000000000000)))
+    return 1;
+  if (test__extendxftf2(0x1.23456789abcdefp+45, UINT64_C(0x402c23456789abcd),
+                        UINT64_C(0xf000000000000000)))
+    return 1;
+  if (test__extendxftf2(0x1.edcba987654321fp-45, UINT64_C(0x3fd2edcba9876543),
+                        UINT64_C(0x2000000000000000)))
+    return 1;
+  // denormal number
+  if (test__extendxftf2(1e-4932L, UINT64_C(0x00004c248f91e526),
+                        UINT64_C(0xafe0000000000000)))
+    return 1;
+  // denormal number
+  if (test__extendxftf2(2e-4932L, UINT64_C(0x000098491f23ca4d),
+                        UINT64_C(0x5fc0000000000000)))
+    return 1;
+#else
+  printf("skipped\n");
+
+#endif
+  return 0;
+}
diff --git a/compiler-rt/test/builtins/Unit/floatditf_test.c b/compiler-rt/test/builtins/Unit/floatditf_test.c
index 4d5da32ec25d42f..fe7a5fd86ae8423 100644
--- a/compiler-rt/test/builtins/Unit/floatditf_test.c
+++ b/compiler-rt/test/builtins/Unit/floatditf_test.c
@@ -17,7 +17,7 @@ COMPILER_RT_ABI long double __floatditf(di_int a);
 int test__floatditf(di_int a, uint64_t expectedHi, uint64_t expectedLo)
 {
     long double x = __floatditf(a);
-    int ret = compareResultLD(x, expectedHi, expectedLo);
+    int ret = compareResultF128(x, expectedHi, expectedLo);
 
     if (ret)
         printf("error in __floatditf(%Ld) = %.20Lf, "
diff --git a/compiler-rt/test/builtins/Unit/floatsitf_test.c b/compiler-rt/test/builtins/Unit/floatsitf_test.c
index 751a4a9b9207afb..b6571b9ba223d9b 100644
--- a/compiler-rt/test/builtins/Unit/floatsitf_test.c
+++ b/compiler-rt/test/builtins/Unit/floatsitf_test.c
@@ -13,7 +13,7 @@ COMPILER_RT_ABI long double __floatsitf(si_int a);
 int test__floatsitf(si_int a, uint64_t expectedHi, uint64_t expectedLo)
 {
     long double x = __floatsitf(a);
-    int ret = compareResultLD(x, expectedHi, expectedLo);
+    int ret = compareResultF128(x, expectedHi, expectedLo);
 
     if (ret)
     {
diff --git a/compiler-rt/test/builtins/Unit/floatunditf_test.c b/compiler-rt/test/builtins/Unit/floatunditf_test.c
index d44ae7934145a6d..8da78da9760293a 100644
--- a/compiler-rt/test/builtins/Unit/floatunditf_test.c
+++ b/compiler-rt/test/builtins/Unit/floatunditf_test.c
@@ -17,7 +17,7 @@ COMPILER_RT_ABI long double __floatunditf(du_int a);
 int test__floatunditf(du_int a, uint64_t expectedHi, uint64_t expectedLo)
 {
     long double x = __floatunditf(a);
-    int ret = compareResultLD(x, expectedHi, expectedLo);
+    int ret = compareResultF128(x, expectedHi, expectedLo);
 
     if (ret)
         printf("error in __floatunditf(%Lu) = %.20Lf, "
diff --git a/compiler-rt/test/builtins/Unit/floatunsitf_test.c b/compiler-rt/test/builtins/Unit/floatunsitf_test.c
index f0a6c63eb83799d..b6b1ba045739900 100644
--- a/compiler-rt/test/builtins/Unit/floatunsitf_test.c
+++ b/compiler-rt/test/builtins/Unit/floatunsitf_test.c
@@ -13,7 +13,7 @@ COMPILER_RT_ABI long double __floatunsitf(su_int a);
 int test__floatunsitf(su_int a, uint64_t expectedHi, uint64_t expectedLo)
 {
     long double x = __floatunsitf(a);
-    int ret = compareResultLD(x, expectedHi, expectedLo);
+    int ret = compareResultF128(x, expectedHi, expectedLo);
 
     if (ret){
         printf("error in test__floatunsitf(%u) = %.20Lf, "
diff --git a/compiler-rt/test/builtins/Unit/fp_test.h b/compiler-rt/test/builtins/Unit/fp_test.h
index e54dfc108e71887..f095ae0701d77e6 100644
--- a/compiler-rt/test/builtins/Unit/fp_test.h
+++ b/compiler-rt/test/builtins/Unit/fp_test.h
@@ -9,6 +9,18 @@
 #define TYPE_FP16 uint16_t
 #endif
 
+// TODO: Switch to using fp_lib.h once QUAD_PRECISION is available on x86_64.
+#if __LDBL_MANT_DIG__ == 113 ||                                                \
+    ((__LDBL_MANT_DIG__ == 64) && defined(__x86_64__) &&                       \
+     (defined(__FLOAT128__) || defined(__SIZEOF_FLOAT128__)))
+#if __LDBL_MANT_DIG__ == 113
+#define TYPE_FP128 long double
+#else
+#define TYPE_FP128 __float128
+#endif
+#define TEST_COMPILER_RT_HAS_FLOAT128
+#endif
+
 enum EXPECTED_RESULT {
     LESS_0, LESS_EQUAL_0, EQUAL_0, GREATER_0, GREATER_EQUAL_0, NEQUAL_0
 };
@@ -38,11 +50,10 @@ static inline double fromRep64(uint64_t x)
     return ret;
 }
 
-#if __LDBL_MANT_DIG__ == 113
-static inline long double fromRep128(uint64_t hi, uint64_t lo)
-{
+#ifdef TEST_COMPILER_RT_HAS_FLOAT128
+static inline TYPE_FP128 fromRep128(uint64_t hi, uint64_t lo) {
     __uint128_t x = ((__uint128_t)hi << 64) + lo;
-    long double ret;
+    TYPE_FP128 ret;
     memcpy(&ret, &x, 16);
     return ret;
 }
@@ -73,9 +84,8 @@ static inline uint64_t toRep64(double x)
     return ret;
 }
 
-#if __LDBL_MANT_DIG__ == 113
-static inline __uint128_t toRep128(long double x)
-{
+#ifdef TEST_COMPILER_RT_HAS_FLOAT128
+static inline __uint128_t toRep128(TYPE_FP128 x) {
     __uint128_t ret;
     memcpy(&ret, &x, 16);
     return ret;
@@ -136,25 +146,23 @@ static inline int compareResultD(double result,
     return 1;
 }
 
-#if __LDBL_MANT_DIG__ == 113
+#ifdef TEST_COMPILER_RT_HAS_FLOAT128
 // return 0 if equal
 // use two 64-bit integers instead of one 128-bit integer
 // because 128-bit integer constant can't be assigned directly
-static inline int compareResultLD(long double result,
-                                  uint64_t expectedHi,
-                                  uint64_t expectedLo)
-{
+static inline int compareResultF128(TYPE_FP128 result, uint64_t expectedHi,
+                                    uint64_t expectedLo) {
     __uint128_t rep = toRep128(result);
     uint64_t hi = rep >> 64;
     uint64_t lo = rep;
 
-    if (hi == expectedHi && lo == expectedLo){
+    if (hi == expectedHi && lo == expectedLo) {
         return 0;
     }
     // test other possible NaN representation(signal NaN)
-    else if (expectedHi == 0x7fff800000000000UL && expectedLo == 0x0UL){
+    else if (expectedHi == 0x7fff800000000000UL && expectedLo == 0x0UL) {
         if ((hi & 0x7fff000000000000UL) == 0x7fff000000000000UL &&
-            ((hi & 0xffffffffffffUL) > 0 || lo > 0)){
+            ((hi & 0xffffffffffffUL) > 0 || lo > 0)) {
             return 0;
         }
     }
@@ -232,9 +240,45 @@ static inline double makeQNaN64(void)
     return fromRep64(0x7ff8000000000000UL);
 }
 
-#if __LDBL_MANT_DIG__ == 113
-static inline long double makeQNaN128(void)
-{
+#if __LDBL_MANT_DIG__ == 64 && defined(__x86_64__)
+static inline long double F80FromRep128(uint64_t hi, uint64_t lo) {
+    __uint128_t x = ((__uint128_t)hi << 64) + lo;
+    long double ret;
+    memcpy(&ret, &x, 16);
+    return ret;
+}
+
+static inline __uint128_t F80ToRep128(long double x) {
+    __uint128_t ret;
+    memcpy(&ret, &x, 16);
+    return ret;
+}
+
+static inline int compareResultF80(long double result, uint64_t expectedHi,
+                                   uint64_t expectedLo) {
+    __uint128_t rep = F80ToRep128(result);
+    // F80 occupies the lower 80 bits of __uint128_t.
+    uint64_t hi = (rep >> 64) & ((1UL << (80 - 64)) - 1);
+    uint64_t lo = rep;
+    return !(hi == expectedHi && lo == expectedLo);
+}
+
+static inline long double makeQNaN80(void) {
+    return F80FromRep128(0x7fffUL, 0xc000000000000000UL);
+}
+
+static inline long double makeNaN80(uint64_t rand) {
+    return F80FromRep128(0x7fffUL,
+                         0x8000000000000000 | (rand & 0x3fffffffffffffff));
+}
+
+static inline long double makeInf80(void) {
+    return F80FromRep128(0x7fffUL, 0x8000000000000000UL);
+}
+#endif
+
+#ifdef TEST_COMPILER_RT_HAS_FLOAT128
+static inline TYPE_FP128 makeQNaN128(void) {
     return fromRep128(0x7fff800000000000UL, 0x0UL);
 }
 #endif
@@ -254,9 +298,8 @@ static inline double makeNaN64(uint64_t rand)
     return fromRep64(0x7ff0000000000000UL | (rand & 0xfffffffffffffUL));
 }
 
-#if __LDBL_MANT_DIG__ == 113
-static inline long double makeNaN128(uint64_t rand)
-{
+#ifdef TEST_COMPILER_RT_HAS_FLOAT128
+static inline TYPE_FP128 makeNaN128(uint64_t rand) {
     return fromRep128(0x7fff000000000000UL | (rand & 0xffffffffffffUL), 0x0UL);
 }
 #endif
@@ -286,14 +329,12 @@ static inline double makeNegativeInf64(void)
     return fromRep64(0xfff0000000000000UL);
 }
 
-#if __LDBL_MANT_DIG__ == 113
-static inline long double makeInf128(void)
-{
+#ifdef TEST_COMPILER_RT_HAS_FLOAT128
+static inline TYPE_FP128 makeInf128(void) {
     return fromRep128(0x7fff000000000000UL, 0x0UL);
 }
 
-static inline long double makeNegativeInf128(void)
-{
+static inline TYPE_FP128 makeNegativeInf128(void) {
     return fromRep128(0xffff000000000000UL, 0x0UL);
 }
 #endif
diff --git a/compiler-rt/test/builtins/Unit/multf3_test.c b/compiler-rt/test/builtins/Unit/multf3_test.c
index 3bf6ab24cec0221..543b55899ce82a9 100644
--- a/compiler-rt/test/builtins/Unit/multf3_test.c
+++ b/compiler-rt/test/builtins/Unit/multf3_test.c
@@ -15,7 +15,7 @@ int test__multf3(long double a, long double b,
                  uint64_t expectedHi, uint64_t expectedLo)
 {
     long double x = __multf3(a, b);
-    int ret = compareResultLD(x, expectedHi, expectedLo);
+    int ret = compareResultF128(x, expectedHi, expectedLo);
 
     if (ret){
         printf("error in test__multf3(%.20Lf, %.20Lf) = %.20Lf, "
diff --git a/compiler-rt/test/builtins/Unit/subtf3_test.c b/compiler-rt/test/builtins/Unit/subtf3_test.c
index 377ae95a9a7d7bb..724fa4820d99d32 100644
--- a/compiler-rt/test/builtins/Unit/subtf3_test.c
+++ b/compiler-rt/test/builtins/Unit/subtf3_test.c
@@ -16,7 +16,7 @@ int test__subtf3(long double a, long double b,
                  uint64_t expectedHi, uint64_t expectedLo)
 {
     long double x = __subtf3(a, b);
-    int ret = compareResultLD(x, expectedHi, expectedLo);
+    int ret = compareResultF128(x, expectedHi, expectedLo);
 
     if (ret){
         printf("error in test__subtf3(%.20Lf, %.20Lf) = %.20Lf, "
diff --git a/compiler-rt/test/builtins/Unit/trunctfxf2_test.c b/compiler-rt/test/builtins/Unit/trunctfxf2_test.c
new file mode 100644
index 000000000000000..53024ef139624aa
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/trunctfxf2_test.c
@@ -0,0 +1,97 @@
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_trunctfxf2
+
+#include "int_lib.h"
+#include <stdio.h>
+
+#if __LDBL_MANT_DIG__ == 64 && defined(__x86_64__) &&                          \
+    (defined(__FLOAT128__) || defined(__SIZEOF_FLOAT128__))
+
+#include "fp_test.h"
+
+COMPILER_RT_ABI long double __trunctfxf2(__float128 a);
+
+int test__trunctfxf2(__float128 a, uint64_t expectedHi, uint64_t expectedLo) {
+  long double x = __trunctfxf2(a);
+  int ret = compareResultF80(x, expectedHi, expectedLo);
+  ;
+  if (ret) {
+    printf("error in __trunctfxf2(%.20Lf) = %.20Lf, "
+           "expected %.20Lf\n",
+           a, x, fromRep128(expectedHi, expectedLo));
+  }
+  return ret;
+}
+
+char assumption_1[sizeof(long double) * CHAR_BIT == 128] = {0};
+
+#endif
+
+int main() {
+#if __LDBL_MANT_DIG__ == 64 && defined(__x86_64__) &&                          \
+    (defined(__FLOAT128__) || defined(__SIZEOF_FLOAT128__))
+  // qNaN
+  if (test__trunctfxf2(makeQNaN128(), UINT64_C(0x7FFF),
+                       UINT64_C(0xC000000000000000)))
+    return 1;
+  // NaN
+  if (test__trunctfxf2(makeNaN128(UINT64_C(0x810000000000)), UINT64_C(0x7FFF),
+                       UINT64_C(0xC080000000000000)))
+    return 1;
+  // inf
+  if (test__trunctfxf2(makeInf128(), UINT64_C(0x7FFF),
+                       UINT64_C(0x8000000000000000)))
+    return 1;
+  // zero
+  if (test__trunctfxf2(0.0Q, UINT64_C(0x0), UINT64_C(0x0)))
+    return 1;
+  if (test__trunctfxf2(0x1.af23456789bbaaab347645365cdep+5L, UINT64_C(0x4004),
+                       UINT64_C(0xd791a2b3c4ddd556)))
+    return 1;
+  if (test__trunctfxf2(0x1.dedafcff354b6ae9758763545432p-9L, UINT64_C(0x3ff6),
+                       UINT64_C(0xef6d7e7f9aa5b575)))
+    return 1;
+  if (test__trunctfxf2(0x1.2f34dd5f437e849b4baab754cdefp+4534L,
+                       UINT64_C(0x51b5), UINT64_C(0x979a6eafa1bf424e)))
+    return 1;
+  if (test__trunctfxf2(0x1.edcbff8ad76ab5bf46463233214fp-435L, UINT64_C(0x3e4c),
+                       UINT64_C(0xf6e5ffc56bb55ae0)))
+    return 1;
+
+  // Test rounding near halfway.
+  __float128 halfwayPlus =
+      fromRep128(UINT64_C(0x7ffa000000000000),
+                 ((UINT64_C(1) << (112 - 63 - 1)) + UINT64_C(1)));
+  if (test__trunctfxf2(halfwayPlus, UINT64_C(0x7ffa),
+                       UINT64_C(0x8000000000000001)))
+    return 1;
+  __float128 halfwayExactOdd = fromRep128(
+      UINT64_C(0x7ffa000000000000),
+      ((UINT64_C(1) << (112 - 63)) + (UINT64_C(1) << (112 - 63 - 1))));
+  if (test__trunctfxf2(halfwayExactOdd, UINT64_C(0x7ffa),
+                       UINT64_C(0x8000000000000002)))
+    return 1;
+  __float128 halfwayExactEven =
+      fromRep128(UINT64_C(0x7ffa000000000000), (UINT64_C(1) << (112 - 63 - 1)));
+  if (test__trunctfxf2(halfwayExactEven, UINT64_C(0x7ffa),
+                       UINT64_C(0x8000000000000000)))
+    return 1;
+  __float128 halfwayRoundingWillChangeExponent =
+      fromRep128(UINT64_C(0x7ffaffffffffffff), UINT64_C(0xffff000000000001));
+  if (test__trunctfxf2(halfwayRoundingWillChangeExponent, UINT64_C(0x7ffb),
+                       UINT64_C(0x8000000000000000)))
+    return 1;
+
+  // denormal number
+  if (test__trunctfxf2(1e-4932Q, UINT64_C(0), UINT64_C(0x261247c8f29357f0)))
+    return 1;
+  // denormal number
+  if (test__trunctfxf2(2e-4932Q, UINT64_C(0), UINT64_C(0x4c248f91e526afe0)))
+    return 1;
+
+#else
+  printf("skipped\n");
+
+#endif
+  return 0;
+}

>From 1c317869e8008fe8d59e5f435fef77aa60b937ce Mon Sep 17 00:00:00 2001
From: Greg Clayton <gclayton at fb.com>
Date: Mon, 16 Oct 2023 16:24:07 -0700
Subject: [PATCH 07/14] llvm-gsymutil now handles empty linkage names
 correctly. (#68931)

Previous to this fix, if we had a DW_TAG_subprogram that had a
DW_AT_linkage_name that was empty, it would attempt to use this name
which would cause an error to be emitted when saving the gsym file to
disk:

error: DWARF conversion failed: : attempted to encode invalid
FunctionInfo object

This patch fixes this issue and adds a unit test case.
---
 llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp |  10 +-
 llvm/unittests/DebugInfo/GSYM/GSYMTest.cpp   | 152 +++++++++++++++++++
 2 files changed, 157 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp b/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp
index e38347f15e3ae8b..d720c1e33495515 100644
--- a/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp
+++ b/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp
@@ -132,11 +132,11 @@ static DWARFDie GetParentDeclContextDIE(DWARFDie &Die) {
 static std::optional<uint32_t>
 getQualifiedNameIndex(DWARFDie &Die, uint64_t Language, GsymCreator &Gsym) {
   // If the dwarf has mangled name, use mangled name
-  if (auto LinkageName =
-          dwarf::toString(Die.findRecursively({dwarf::DW_AT_MIPS_linkage_name,
-                                               dwarf::DW_AT_linkage_name}),
-                          nullptr))
-    return Gsym.insertString(LinkageName, /* Copy */ false);
+  if (auto LinkageName = Die.getLinkageName()) {
+    // We have seen cases were linkage name is actually empty.
+    if (strlen(LinkageName) > 0)
+      return Gsym.insertString(LinkageName, /* Copy */ false);
+  }
 
   StringRef ShortName(Die.getName(DINameKind::ShortName));
   if (ShortName.empty())
diff --git a/llvm/unittests/DebugInfo/GSYM/GSYMTest.cpp b/llvm/unittests/DebugInfo/GSYM/GSYMTest.cpp
index 58bc83997d1a926..ad81a2fcd16441a 100644
--- a/llvm/unittests/DebugInfo/GSYM/GSYMTest.cpp
+++ b/llvm/unittests/DebugInfo/GSYM/GSYMTest.cpp
@@ -4005,3 +4005,155 @@ TEST(GSYMTest, TestEmptyRangeWarnings) {
   // Make sure we don't see spurious errors in the output:
   EXPECT_TRUE(errors.find("error:") == std::string::npos);
 }
+
+
+TEST(GSYMTest, TestEmptyLinkageName) {
+  // This example has a single compile unit that has a DW_TAG_subprogram that
+  // has a function that has an empty linkage name and a valid normal name.
+  // Previously this would cause an encoding error:
+  //
+  // DWARF conversion failed: attempted to encode invalid FunctionInfo object
+  //
+  // This was because we would get a valid but empty linkage name and we would
+  // try to use this in the GSYM FunctionInfo and that would cause the error
+  // as the name was empty.
+  //
+  // 0x0000000b: DW_TAG_compile_unit
+  //               DW_AT_name        ("/tmp/main.cpp")
+  //               DW_AT_language    (DW_LANG_C)
+  //               DW_AT_stmt_list   (0x00000000)
+  //
+  // 0x00000015:   DW_TAG_subprogram
+  //                 DW_AT_name      ("foo")
+  //                 DW_AT_linkage_name      ("")
+  //                 DW_AT_low_pc    (0x0000000000001000)
+  //                 DW_AT_high_pc   (0x0000000000001050)
+  //
+  // 0x0000002e:   NULL
+
+
+  StringRef yamldata = R"(
+  debug_str:
+    - ''
+    - '/tmp/main.cpp'
+    - foo
+    - ''
+  debug_abbrev:
+    - ID:              0
+      Table:
+        - Code:            0x1
+          Tag:             DW_TAG_compile_unit
+          Children:        DW_CHILDREN_yes
+          Attributes:
+            - Attribute:       DW_AT_name
+              Form:            DW_FORM_strp
+            - Attribute:       DW_AT_language
+              Form:            DW_FORM_udata
+            - Attribute:       DW_AT_stmt_list
+              Form:            DW_FORM_sec_offset
+        - Code:            0x2
+          Tag:             DW_TAG_subprogram
+          Children:        DW_CHILDREN_no
+          Attributes:
+            - Attribute:       DW_AT_name
+              Form:            DW_FORM_strp
+            - Attribute:       DW_AT_linkage_name
+              Form:            DW_FORM_strp
+            - Attribute:       DW_AT_low_pc
+              Form:            DW_FORM_addr
+            - Attribute:       DW_AT_high_pc
+              Form:            DW_FORM_addr
+  debug_info:
+    - Length:          0x2B
+      Version:         4
+      AbbrevTableID:   0
+      AbbrOffset:      0x0
+      AddrSize:        8
+      Entries:
+        - AbbrCode:        0x1
+          Values:
+            - Value:           0x1
+            - Value:           0x2
+            - Value:           0x0
+        - AbbrCode:        0x2
+          Values:
+            - Value:           0xF
+            - Value:           0x13
+            - Value:           0x1000
+            - Value:           0x1050
+        - AbbrCode:        0x0
+  debug_line:
+    - Length:          68
+      Version:         2
+      PrologueLength:  36
+      MinInstLength:   1
+      DefaultIsStmt:   1
+      LineBase:        251
+      LineRange:       14
+      OpcodeBase:      13
+      StandardOpcodeLengths: [ 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1 ]
+      IncludeDirs:
+        - '/tmp'
+      Files:
+        - Name:            main.cpp
+          DirIdx:          1
+          ModTime:         0
+          Length:          0
+      Opcodes:
+        - Opcode:          DW_LNS_extended_op
+          ExtLen:          9
+          SubOpcode:       DW_LNE_set_address
+          Data:            4096
+        - Opcode:          DW_LNS_advance_line
+          SData:           9
+          Data:            0
+        - Opcode:          DW_LNS_copy
+          Data:            0
+        - Opcode:          DW_LNS_advance_pc
+          Data:            256
+        - Opcode:          DW_LNS_advance_line
+          SData:           1
+          Data:            0
+        - Opcode:          DW_LNS_copy
+          Data:            0
+        - Opcode:          DW_LNS_advance_pc
+          Data:            256
+        - Opcode:          DW_LNS_extended_op
+          ExtLen:          1
+          SubOpcode:       DW_LNE_end_sequence
+          Data:            0
+  )";
+  auto ErrOrSections = DWARFYAML::emitDebugSections(yamldata);
+  ASSERT_THAT_EXPECTED(ErrOrSections, Succeeded());
+  std::unique_ptr<DWARFContext> DwarfContext =
+      DWARFContext::create(*ErrOrSections, 8);
+  ASSERT_TRUE(DwarfContext.get() != nullptr);
+  std::string errors;
+  raw_string_ostream OS(errors);
+  GsymCreator GC;
+  DwarfTransformer DT(*DwarfContext, GC);
+  const uint32_t ThreadCount = 1;
+  ASSERT_THAT_ERROR(DT.convert(ThreadCount, &OS), Succeeded());
+  ASSERT_THAT_ERROR(GC.finalize(OS), Succeeded());
+  OS.flush();
+  SmallString<512> Str;
+  raw_svector_ostream OutStrm(Str);
+  const auto ByteOrder = llvm::endianness::native;
+  FileWriter FW(OutStrm, ByteOrder);
+  ASSERT_THAT_ERROR(GC.encode(FW), Succeeded());
+  Expected<GsymReader> GR = GsymReader::copyBuffer(OutStrm.str());
+  ASSERT_THAT_EXPECTED(GR, Succeeded());
+  // There should be one function in our GSYM.
+  EXPECT_EQ(GR->getNumAddresses(), 1u);
+  // Verify "foo" is present and has a line table and no inline info.
+  auto ExpFI = GR->getFunctionInfo(0x1000);
+  ASSERT_THAT_EXPECTED(ExpFI, Succeeded());
+  ASSERT_EQ(ExpFI->Range, AddressRange(0x1000, 0x1050));
+  EXPECT_TRUE(ExpFI->OptLineTable.has_value());
+  EXPECT_FALSE(ExpFI->Inline.has_value());
+  StringRef FuncName = GR->getString(ExpFI->Name);
+  EXPECT_EQ(FuncName, "foo");
+
+  // Make sure we don't see spurious errors in the output:
+  EXPECT_TRUE(errors.find("error:") == std::string::npos);
+}

>From 9ed0f771ccc7b850e09b4cf2028ef2455629ffee Mon Sep 17 00:00:00 2001
From: Peter Klausler <35819229+klausler at users.noreply.github.com>
Date: Mon, 16 Oct 2023 16:26:06 -0700
Subject: [PATCH 08/14] [flang] Fix CFI_CDESC_T for C++ interoperability
 (#67568)

Full namespace qualification is needed on an identifier.
---
 flang/include/flang/ISO_Fortran_binding.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/flang/include/flang/ISO_Fortran_binding.h b/flang/include/flang/ISO_Fortran_binding.h
index 2893fd46c267d9b..51d6219427cce5e 100644
--- a/flang/include/flang/ISO_Fortran_binding.h
+++ b/flang/include/flang/ISO_Fortran_binding.h
@@ -169,7 +169,7 @@ template <int r> struct CdescStorage : public CFI_cdesc_t {
 template <> struct CdescStorage<1> : public CFI_cdesc_t {};
 template <> struct CdescStorage<0> : public CFI_cdesc_t {};
 } // namespace cfi_internal
-#define CFI_CDESC_T(rank) cfi_internal::CdescStorage<rank>
+#define CFI_CDESC_T(rank) ::Fortran::ISO::cfi_internal::CdescStorage<rank>
 #else
 #define CFI_CDESC_T(_RANK) \
   struct { \
@@ -200,8 +200,8 @@ RT_API_ATTRS int CFI_setpointer(
 #ifdef __cplusplus
 } // extern "C"
 } // inline namespace Fortran_2018
-}
-}
+} // namespace ISO
+} // namespace Fortran
 #endif
 
 #endif /* CFI_ISO_FORTRAN_BINDING_H_ */

>From 93cc7a65392287d735002c0da2679a70e0786093 Mon Sep 17 00:00:00 2001
From: Peter Klausler <35819229+klausler at users.noreply.github.com>
Date: Mon, 16 Oct 2023 16:36:46 -0700
Subject: [PATCH 09/14] [flang] Remove IEEE_DENORM from IEEE_ALL (#67573)

The array of all exceptions IEEE_ALL defined in the intrinsic module
IEEE_EXCEPTIONS should contain only what the standard mandates. Existing
code depends on it having only five elements. The legacy extension
exception flag IEEE_DENORM shouldn't be an element.
---
 flang/module/__fortran_ieee_exceptions.f90 | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/flang/module/__fortran_ieee_exceptions.f90 b/flang/module/__fortran_ieee_exceptions.f90
index 77dc6f85517869b..785c4adaec25d55 100644
--- a/flang/module/__fortran_ieee_exceptions.f90
+++ b/flang/module/__fortran_ieee_exceptions.f90
@@ -27,10 +27,8 @@
     ieee_denorm = ieee_flag_type(32) ! PGI extension
 
   type(ieee_flag_type), parameter :: &
-    ieee_usual(*) = [ &
-      ieee_overflow, ieee_divide_by_zero, ieee_invalid ], &
-    ieee_all(*) = [ &
-      ieee_usual, ieee_underflow, ieee_inexact, ieee_denorm ]
+    ieee_usual(*) = [ ieee_overflow, ieee_divide_by_zero, ieee_invalid ], &
+    ieee_all(*) = [ ieee_usual, ieee_underflow, ieee_inexact ]
 
   type :: ieee_modes_type ! Fortran 2018, 17.7
     private

>From 39dbc06ae341734a65e9d627acdcc2f6bf5f7ae0 Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Sun, 15 Oct 2023 08:49:46 -0700
Subject: [PATCH 10/14] [RISCV] Pre-commit concat-vectors-constant-stride.ll

This patch commits tests that can be optimized by improving
performCONCAT_VECTORCombine to do a better job at decomposing the base
pointer and recognizing a constant offset.
---
 .../rvv/concat-vectors-constant-stride.ll     | 231 ++++++++++++++++++
 1 file changed, 231 insertions(+)
 create mode 100644 llvm/test/CodeGen/RISCV/rvv/concat-vectors-constant-stride.ll

diff --git a/llvm/test/CodeGen/RISCV/rvv/concat-vectors-constant-stride.ll b/llvm/test/CodeGen/RISCV/rvv/concat-vectors-constant-stride.ll
new file mode 100644
index 000000000000000..611270ab98ebdaf
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/concat-vectors-constant-stride.ll
@@ -0,0 +1,231 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+v,+unaligned-vector-mem -target-abi=ilp32 \
+; RUN:     -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+v,+unaligned-vector-mem -target-abi=lp64 \
+; RUN:     -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
+
+define void @constant_forward_stride(ptr %s, ptr %d) {
+; CHECK-LABEL: constant_forward_stride:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a2, a0, 16
+; CHECK-NEXT:    addi a3, a0, 32
+; CHECK-NEXT:    addi a4, a0, 48
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vle8.v v9, (a2)
+; CHECK-NEXT:    vle8.v v10, (a3)
+; CHECK-NEXT:    vle8.v v11, (a4)
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v9, 2
+; CHECK-NEXT:    vsetivli zero, 6, e8, mf2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v10, 4
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v11, 6
+; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = getelementptr inbounds i8, ptr %s, i64 16
+  %2 = getelementptr inbounds i8, ptr %s, i64 32
+  %3 = getelementptr inbounds i8, ptr %s, i64 48
+  %4 = load <2 x i8>, ptr %s, align 1
+  %5 = load <2 x i8>, ptr %1, align 1
+  %6 = load <2 x i8>, ptr %2, align 1
+  %7 = load <2 x i8>, ptr %3, align 1
+  %8 = shufflevector <2 x i8> %4, <2 x i8> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %9 = shufflevector <2 x i8> %6, <2 x i8> %7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %10 = shufflevector <4 x i8> %8, <4 x i8> %9, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  store <8 x i8> %10, ptr %d, align 1
+  ret void
+}
+
+define void @constant_forward_stride2(ptr %s, ptr %d) {
+; CHECK-LABEL: constant_forward_stride2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a2, a0, -16
+; CHECK-NEXT:    addi a3, a0, -32
+; CHECK-NEXT:    addi a4, a0, -48
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; CHECK-NEXT:    vle8.v v8, (a4)
+; CHECK-NEXT:    vle8.v v9, (a3)
+; CHECK-NEXT:    vle8.v v10, (a2)
+; CHECK-NEXT:    vle8.v v11, (a0)
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v9, 2
+; CHECK-NEXT:    vsetivli zero, 6, e8, mf2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v10, 4
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v11, 6
+; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = getelementptr inbounds i8, ptr %s, i64 -16
+  %2 = getelementptr inbounds i8, ptr %s, i64 -32
+  %3 = getelementptr inbounds i8, ptr %s, i64 -48
+  %4 = load <2 x i8>, ptr %3, align 1
+  %5 = load <2 x i8>, ptr %2, align 1
+  %6 = load <2 x i8>, ptr %1, align 1
+  %7 = load <2 x i8>, ptr %s, align 1
+  %8 = shufflevector <2 x i8> %4, <2 x i8> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %9 = shufflevector <2 x i8> %6, <2 x i8> %7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %10 = shufflevector <4 x i8> %8, <4 x i8> %9, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  store <8 x i8> %10, ptr %d, align 1
+  ret void
+}
+
+define void @constant_forward_stride3(ptr %s, ptr %d) {
+; CHECK-LABEL: constant_forward_stride3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a2, a0, 16
+; CHECK-NEXT:    addi a3, a0, 32
+; CHECK-NEXT:    addi a4, a0, 48
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vle8.v v9, (a2)
+; CHECK-NEXT:    vle8.v v10, (a3)
+; CHECK-NEXT:    vle8.v v11, (a4)
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v9, 2
+; CHECK-NEXT:    vsetivli zero, 6, e8, mf2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v10, 4
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v11, 6
+; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = getelementptr inbounds i8, ptr %s, i64 16
+  %2 = getelementptr inbounds i8, ptr %s, i64 32
+  %3 = getelementptr inbounds i8, ptr %s, i64 48
+  %4 = getelementptr inbounds i8, ptr %1, i64 0
+  %5 = getelementptr inbounds i8, ptr %2, i64 0
+  %6 = getelementptr inbounds i8, ptr %3, i64 0
+  %7 = load <2 x i8>, ptr %s, align 1
+  %8 = load <2 x i8>, ptr %4, align 1
+  %9 = load <2 x i8>, ptr %5, align 1
+  %10 = load <2 x i8>, ptr %6, align 1
+  %11 = shufflevector <2 x i8> %7, <2 x i8> %8, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %12 = shufflevector <2 x i8> %9, <2 x i8> %10, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %13 = shufflevector <4 x i8> %11, <4 x i8> %12, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  store <8 x i8> %13, ptr %d, align 1
+  ret void
+}
+
+define void @constant_back_stride(ptr %s, ptr %d) {
+; CHECK-LABEL: constant_back_stride:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a2, a0, -16
+; CHECK-NEXT:    addi a3, a0, -32
+; CHECK-NEXT:    addi a4, a0, -48
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vle8.v v9, (a2)
+; CHECK-NEXT:    vle8.v v10, (a3)
+; CHECK-NEXT:    vle8.v v11, (a4)
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v9, 2
+; CHECK-NEXT:    vsetivli zero, 6, e8, mf2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v10, 4
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v11, 6
+; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = getelementptr inbounds i8, ptr %s, i64 -16
+  %2 = getelementptr inbounds i8, ptr %s, i64 -32
+  %3 = getelementptr inbounds i8, ptr %s, i64 -48
+  %4 = load <2 x i8>, ptr %s, align 1
+  %5 = load <2 x i8>, ptr %1, align 1
+  %6 = load <2 x i8>, ptr %2, align 1
+  %7 = load <2 x i8>, ptr %3, align 1
+  %8 = shufflevector <2 x i8> %4, <2 x i8> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %9 = shufflevector <2 x i8> %6, <2 x i8> %7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %10 = shufflevector <4 x i8> %8, <4 x i8> %9, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  store <8 x i8> %10, ptr %d, align 1
+  ret void
+}
+
+define void @constant_back_stride2(ptr %s, ptr %d) {
+; CHECK-LABEL: constant_back_stride2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a2, a0, 16
+; CHECK-NEXT:    addi a3, a0, 32
+; CHECK-NEXT:    addi a4, a0, 48
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; CHECK-NEXT:    vle8.v v8, (a4)
+; CHECK-NEXT:    vle8.v v9, (a3)
+; CHECK-NEXT:    vle8.v v10, (a2)
+; CHECK-NEXT:    vle8.v v11, (a0)
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v9, 2
+; CHECK-NEXT:    vsetivli zero, 6, e8, mf2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v10, 4
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v11, 6
+; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = getelementptr inbounds i8, ptr %s, i64 16
+  %2 = getelementptr inbounds i8, ptr %s, i64 32
+  %3 = getelementptr inbounds i8, ptr %s, i64 48
+  %4 = load <2 x i8>, ptr %3, align 1
+  %5 = load <2 x i8>, ptr %2, align 1
+  %6 = load <2 x i8>, ptr %1, align 1
+  %7 = load <2 x i8>, ptr %s, align 1
+  %8 = shufflevector <2 x i8> %4, <2 x i8> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %9 = shufflevector <2 x i8> %6, <2 x i8> %7, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %10 = shufflevector <4 x i8> %8, <4 x i8> %9, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  store <8 x i8> %10, ptr %d, align 1
+  ret void
+}
+
+define void @constant_back_stride3(ptr %s, ptr %d) {
+; CHECK-LABEL: constant_back_stride3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a2, a0, -16
+; CHECK-NEXT:    addi a3, a0, -32
+; CHECK-NEXT:    addi a4, a0, -48
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vle8.v v9, (a2)
+; CHECK-NEXT:    vle8.v v10, (a3)
+; CHECK-NEXT:    vle8.v v11, (a4)
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v9, 2
+; CHECK-NEXT:    vsetivli zero, 6, e8, mf2, tu, ma
+; CHECK-NEXT:    vslideup.vi v8, v10, 4
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v11, 6
+; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = getelementptr inbounds i8, ptr %s, i64 -16
+  %2 = getelementptr inbounds i8, ptr %s, i64 -32
+  %3 = getelementptr inbounds i8, ptr %s, i64 -48
+  %4 = getelementptr inbounds i8, ptr %1, i64 0
+  %5 = getelementptr inbounds i8, ptr %2, i64 0
+  %6 = getelementptr inbounds i8, ptr %3, i64 0
+  %7 = load <2 x i8>, ptr %s, align 1
+  %8 = load <2 x i8>, ptr %4, align 1
+  %9 = load <2 x i8>, ptr %5, align 1
+  %10 = load <2 x i8>, ptr %6, align 1
+  %11 = shufflevector <2 x i8> %7, <2 x i8> %8, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %12 = shufflevector <2 x i8> %9, <2 x i8> %10, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %13 = shufflevector <4 x i8> %11, <4 x i8> %12, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  store <8 x i8> %13, ptr %d, align 1
+  ret void
+}
+
+define void @constant_zero_stride(ptr %s, ptr %d) {
+; CHECK-LABEL: constant_zero_stride:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
+; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT:    vmv1r.v v9, v8
+; CHECK-NEXT:    vslideup.vi v9, v8, 2
+; CHECK-NEXT:    vse8.v v9, (a1)
+; CHECK-NEXT:    ret
+  %1 = getelementptr inbounds i8, ptr %s, i64 0
+  %2 = load <2 x i8>, ptr %s, align 1
+  %3 = load <2 x i8>, ptr %1, align 1
+  %4 = shufflevector <2 x i8> %2, <2 x i8> %3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  store <4 x i8> %4, ptr %d, align 1
+  ret void
+}
+
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; RV32: {{.*}}
+; RV64: {{.*}}

>From 51ebcbffdc0be69491b6d16cf8bd2c0d8ac0597e Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Sun, 15 Oct 2023 09:00:04 -0700
Subject: [PATCH 11/14] [RISCV] Improve performCONCAT_VECTORCombine stride
 matching

If the load ptrs can be decomposed into a common (Base + Index) with a
common constant stride, then return the constant stride.
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   |  21 +++-
 .../rvv/concat-vectors-constant-stride.ll     | 116 ++++--------------
 2 files changed, 43 insertions(+), 94 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 6eb253cc5146635..4dc3f6137e3061a 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -27,6 +27,7 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineJumpTableInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
 #include "llvm/CodeGen/ValueTypes.h"
 #include "llvm/IR/DiagnosticInfo.h"
@@ -13803,9 +13804,17 @@ static SDValue performCONCAT_VECTORSCombine(SDNode *N, SelectionDAG &DAG,
     Align = std::min(Align, Ld->getAlign());
   }
 
-  using PtrDiff = std::pair<SDValue, bool>;
-  auto GetPtrDiff = [](LoadSDNode *Ld1,
-                       LoadSDNode *Ld2) -> std::optional<PtrDiff> {
+  using PtrDiff = std::pair<std::variant<int64_t, SDValue>, bool>;
+  auto GetPtrDiff = [&DAG](LoadSDNode *Ld1,
+                           LoadSDNode *Ld2) -> std::optional<PtrDiff> {
+    // If the load ptrs can be decomposed into a common (Base + Index) with a
+    // common constant stride, then return the constant stride.
+    BaseIndexOffset BIO1 = BaseIndexOffset::match(Ld1, DAG);
+    BaseIndexOffset BIO2 = BaseIndexOffset::match(Ld2, DAG);
+    if (BIO1.equalBaseIndex(BIO2, DAG))
+      return {{BIO2.getOffset() - BIO1.getOffset(), false}};
+
+    // Otherwise try to match (add LastPtr, Stride) or (add NextPtr, Stride)
     SDValue P1 = Ld1->getBasePtr();
     SDValue P2 = Ld2->getBasePtr();
     if (P2.getOpcode() == ISD::ADD && P2.getOperand(0) == P1)
@@ -13844,7 +13853,11 @@ static SDValue performCONCAT_VECTORSCombine(SDNode *N, SelectionDAG &DAG,
   if (!TLI.isLegalStridedLoadStore(WideVecVT, Align))
     return SDValue();
 
-  auto [Stride, MustNegateStride] = *BaseDiff;
+  auto [StrideVariant, MustNegateStride] = *BaseDiff;
+  SDValue Stride = std::holds_alternative<SDValue>(StrideVariant)
+                       ? std::get<SDValue>(StrideVariant)
+                       : DAG.getConstant(std::get<int64_t>(StrideVariant), DL,
+                                         Lds[0]->getOffset().getValueType());
   if (MustNegateStride)
     Stride = DAG.getNegative(Stride, DL, Stride.getValueType());
 
diff --git a/llvm/test/CodeGen/RISCV/rvv/concat-vectors-constant-stride.ll b/llvm/test/CodeGen/RISCV/rvv/concat-vectors-constant-stride.ll
index 611270ab98ebdaf..ff35043dbd7e75e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/concat-vectors-constant-stride.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/concat-vectors-constant-stride.ll
@@ -7,21 +7,10 @@
 define void @constant_forward_stride(ptr %s, ptr %d) {
 ; CHECK-LABEL: constant_forward_stride:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi a2, a0, 16
-; CHECK-NEXT:    addi a3, a0, 32
-; CHECK-NEXT:    addi a4, a0, 48
-; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    vle8.v v9, (a2)
-; CHECK-NEXT:    vle8.v v10, (a3)
-; CHECK-NEXT:    vle8.v v11, (a4)
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v9, 2
-; CHECK-NEXT:    vsetivli zero, 6, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 4
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v11, 6
-; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    li a2, 16
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vlse16.v v8, (a0), a2
+; CHECK-NEXT:    vse16.v v8, (a1)
 ; CHECK-NEXT:    ret
   %1 = getelementptr inbounds i8, ptr %s, i64 16
   %2 = getelementptr inbounds i8, ptr %s, i64 32
@@ -40,21 +29,11 @@ define void @constant_forward_stride(ptr %s, ptr %d) {
 define void @constant_forward_stride2(ptr %s, ptr %d) {
 ; CHECK-LABEL: constant_forward_stride2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi a2, a0, -16
-; CHECK-NEXT:    addi a3, a0, -32
-; CHECK-NEXT:    addi a4, a0, -48
-; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a4)
-; CHECK-NEXT:    vle8.v v9, (a3)
-; CHECK-NEXT:    vle8.v v10, (a2)
-; CHECK-NEXT:    vle8.v v11, (a0)
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v9, 2
-; CHECK-NEXT:    vsetivli zero, 6, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 4
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v11, 6
-; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    addi a0, a0, -48
+; CHECK-NEXT:    li a2, 16
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vlse16.v v8, (a0), a2
+; CHECK-NEXT:    vse16.v v8, (a1)
 ; CHECK-NEXT:    ret
   %1 = getelementptr inbounds i8, ptr %s, i64 -16
   %2 = getelementptr inbounds i8, ptr %s, i64 -32
@@ -73,21 +52,10 @@ define void @constant_forward_stride2(ptr %s, ptr %d) {
 define void @constant_forward_stride3(ptr %s, ptr %d) {
 ; CHECK-LABEL: constant_forward_stride3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi a2, a0, 16
-; CHECK-NEXT:    addi a3, a0, 32
-; CHECK-NEXT:    addi a4, a0, 48
-; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    vle8.v v9, (a2)
-; CHECK-NEXT:    vle8.v v10, (a3)
-; CHECK-NEXT:    vle8.v v11, (a4)
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v9, 2
-; CHECK-NEXT:    vsetivli zero, 6, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 4
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v11, 6
-; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    li a2, 16
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vlse16.v v8, (a0), a2
+; CHECK-NEXT:    vse16.v v8, (a1)
 ; CHECK-NEXT:    ret
   %1 = getelementptr inbounds i8, ptr %s, i64 16
   %2 = getelementptr inbounds i8, ptr %s, i64 32
@@ -109,21 +77,10 @@ define void @constant_forward_stride3(ptr %s, ptr %d) {
 define void @constant_back_stride(ptr %s, ptr %d) {
 ; CHECK-LABEL: constant_back_stride:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi a2, a0, -16
-; CHECK-NEXT:    addi a3, a0, -32
-; CHECK-NEXT:    addi a4, a0, -48
-; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    vle8.v v9, (a2)
-; CHECK-NEXT:    vle8.v v10, (a3)
-; CHECK-NEXT:    vle8.v v11, (a4)
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v9, 2
-; CHECK-NEXT:    vsetivli zero, 6, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 4
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v11, 6
-; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    li a2, -16
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vlse16.v v8, (a0), a2
+; CHECK-NEXT:    vse16.v v8, (a1)
 ; CHECK-NEXT:    ret
   %1 = getelementptr inbounds i8, ptr %s, i64 -16
   %2 = getelementptr inbounds i8, ptr %s, i64 -32
@@ -142,21 +99,11 @@ define void @constant_back_stride(ptr %s, ptr %d) {
 define void @constant_back_stride2(ptr %s, ptr %d) {
 ; CHECK-LABEL: constant_back_stride2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi a2, a0, 16
-; CHECK-NEXT:    addi a3, a0, 32
-; CHECK-NEXT:    addi a4, a0, 48
-; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a4)
-; CHECK-NEXT:    vle8.v v9, (a3)
-; CHECK-NEXT:    vle8.v v10, (a2)
-; CHECK-NEXT:    vle8.v v11, (a0)
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v9, 2
-; CHECK-NEXT:    vsetivli zero, 6, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 4
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v11, 6
-; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    addi a0, a0, 48
+; CHECK-NEXT:    li a2, -16
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vlse16.v v8, (a0), a2
+; CHECK-NEXT:    vse16.v v8, (a1)
 ; CHECK-NEXT:    ret
   %1 = getelementptr inbounds i8, ptr %s, i64 16
   %2 = getelementptr inbounds i8, ptr %s, i64 32
@@ -175,21 +122,10 @@ define void @constant_back_stride2(ptr %s, ptr %d) {
 define void @constant_back_stride3(ptr %s, ptr %d) {
 ; CHECK-LABEL: constant_back_stride3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi a2, a0, -16
-; CHECK-NEXT:    addi a3, a0, -32
-; CHECK-NEXT:    addi a4, a0, -48
-; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    vle8.v v9, (a2)
-; CHECK-NEXT:    vle8.v v10, (a3)
-; CHECK-NEXT:    vle8.v v11, (a4)
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v9, 2
-; CHECK-NEXT:    vsetivli zero, 6, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v8, v10, 4
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vi v8, v11, 6
-; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    li a2, -16
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vlse16.v v8, (a0), a2
+; CHECK-NEXT:    vse16.v v8, (a1)
 ; CHECK-NEXT:    ret
   %1 = getelementptr inbounds i8, ptr %s, i64 -16
   %2 = getelementptr inbounds i8, ptr %s, i64 -32

>From 3b76bb991a7eda6d40f1276c5739d5ce1fc80980 Mon Sep 17 00:00:00 2001
From: Alexander Shaposhnikov <ashaposhnikov at google.com>
Date: Mon, 16 Oct 2023 23:46:58 +0000
Subject: [PATCH 12/14] [compiler-rt] Fix build of builtins on Windows

Fix Windows build after 910a4bf5b70ae14e
(the breakage was found by the buildbot
https://lab.llvm.org/buildbot/#/builders/127/builds/56796)
---
 compiler-rt/lib/builtins/fp_extend.h | 21 ++++++++++++++-------
 compiler-rt/lib/builtins/fp_trunc.h  | 24 ++++++++++++++++--------
 2 files changed, 30 insertions(+), 15 deletions(-)

diff --git a/compiler-rt/lib/builtins/fp_extend.h b/compiler-rt/lib/builtins/fp_extend.h
index 86b32be12d55fc3..d640bdcb0ec1fa4 100644
--- a/compiler-rt/lib/builtins/fp_extend.h
+++ b/compiler-rt/lib/builtins/fp_extend.h
@@ -23,7 +23,8 @@ typedef uint32_t src_rep_t;
 static const int srcBits = sizeof(src_t) * CHAR_BIT;
 static const int srcSigFracBits = 23;
 // -1 accounts for the sign bit.
-static const int srcExpBits = srcBits - srcSigFracBits - 1;
+// srcBits - srcSigFracBits - 1
+static const int srcExpBits = 8;
 #define src_rep_t_clz clzsi
 
 #elif defined SRC_DOUBLE
@@ -33,7 +34,8 @@ typedef uint64_t src_rep_t;
 static const int srcBits = sizeof(src_t) * CHAR_BIT;
 static const int srcSigFracBits = 52;
 // -1 accounts for the sign bit.
-static const int srcExpBits = srcBits - srcSigFracBits - 1;
+// srcBits - srcSigFracBits - 1
+static const int srcExpBits = 11;
 
 static inline int src_rep_t_clz_impl(src_rep_t a) {
 #if defined __LP64__
@@ -56,7 +58,8 @@ static const int srcBits = 80;
 static const int srcSigFracBits = 63;
 // -1 accounts for the sign bit.
 // -1 accounts for the explicitly stored integer bit.
-static const int srcExpBits = srcBits - srcSigFracBits - 1 - 1;
+// srcBits - srcSigFracBits - 1 - 1
+static const int srcExpBits = 15;
 
 #elif defined SRC_HALF
 #ifdef COMPILER_RT_HAS_FLOAT16
@@ -69,7 +72,8 @@ typedef uint16_t src_rep_t;
 static const int srcBits = sizeof(src_t) * CHAR_BIT;
 static const int srcSigFracBits = 10;
 // -1 accounts for the sign bit.
-static const int srcExpBits = srcBits - srcSigFracBits - 1;
+// srcBits - srcSigFracBits - 1
+static const int srcExpBits = 5;
 
 #define src_rep_t_clz __builtin_clz
 
@@ -84,7 +88,8 @@ typedef uint32_t dst_rep_t;
 static const int dstBits = sizeof(dst_t) * CHAR_BIT;
 static const int dstSigFracBits = 23;
 // -1 accounts for the sign bit.
-static const int dstExpBits = dstBits - dstSigFracBits - 1;
+// dstBits - dstSigFracBits - 1
+static const int dstExpBits = 8;
 
 #elif defined DST_DOUBLE
 typedef double dst_t;
@@ -93,7 +98,8 @@ typedef uint64_t dst_rep_t;
 static const int dstBits = sizeof(dst_t) * CHAR_BIT;
 static const int dstSigFracBits = 52;
 // -1 accounts for the sign bit.
-static const int dstExpBits = dstBits - dstSigFracBits - 1;
+// dstBits - dstSigFracBits - 1
+static const int dstExpBits = 11;
 
 #elif defined DST_QUAD
 // TODO: use fp_lib.h once QUAD_PRECISION is available on x86_64.
@@ -108,7 +114,8 @@ typedef __uint128_t dst_rep_t;
 static const int dstBits = sizeof(dst_t) * CHAR_BIT;
 static const int dstSigFracBits = 112;
 // -1 accounts for the sign bit.
-static const int dstExpBits = dstBits - dstSigFracBits - 1;
+// dstBits - dstSigFracBits - 1
+static const int dstExpBits = 15;
 
 #else
 #error Destination should be single, double, or quad precision!
diff --git a/compiler-rt/lib/builtins/fp_trunc.h b/compiler-rt/lib/builtins/fp_trunc.h
index ea13dc2efae5411..f62f8bafc7995f7 100644
--- a/compiler-rt/lib/builtins/fp_trunc.h
+++ b/compiler-rt/lib/builtins/fp_trunc.h
@@ -22,7 +22,8 @@ typedef uint32_t src_rep_t;
 static const int srcBits = sizeof(src_t) * CHAR_BIT;
 static const int srcSigFracBits = 23;
 // -1 accounts for the sign bit.
-static const int srcExpBits = srcBits - srcSigFracBits - 1;
+// srcBits - srcSigFracBits - 1
+static const int srcExpBits = 8;
 
 #elif defined SRC_DOUBLE
 typedef double src_t;
@@ -31,7 +32,8 @@ typedef uint64_t src_rep_t;
 static const int srcBits = sizeof(src_t) * CHAR_BIT;
 static const int srcSigFracBits = 52;
 // -1 accounts for the sign bit.
-static const int srcExpBits = srcBits - srcSigFracBits - 1;
+// srcBits - srcSigFracBits - 1
+static const int srcExpBits = 11;
 
 #elif defined SRC_QUAD
 // TODO: use fp_lib.h once QUAD_PRECISION is available on x86_64.
@@ -46,7 +48,8 @@ typedef __uint128_t src_rep_t;
 static const int srcBits = sizeof(src_t) * CHAR_BIT;
 static const int srcSigFracBits = 112;
 // -1 accounts for the sign bit.
-static const int srcExpBits = srcBits - srcSigFracBits - 1;
+// srcBits - srcSigFracBits - 1
+static const int srcExpBits = 15;
 
 #else
 #error Source should be double precision or quad precision!
@@ -59,7 +62,8 @@ typedef uint64_t dst_rep_t;
 static const int dstBits = sizeof(dst_t) * CHAR_BIT;
 static const int dstSigFracBits = 52;
 // -1 accounts for the sign bit.
-static const int dstExpBits = dstBits - dstSigFracBits - 1;
+// dstBits - dstSigFracBits - 1
+static const int dstExpBits = 11;
 
 #elif defined DST_80
 typedef long double dst_t;
@@ -69,7 +73,8 @@ static const int dstBits = 80;
 static const int dstSigFracBits = 63;
 // -1 accounts for the sign bit.
 // -1 accounts for the explicitly stored integer bit.
-static const int dstExpBits = dstBits - dstSigFracBits - 1 - 1;
+// dstBits - dstSigFracBits - 1 - 1
+static const int dstExpBits = 15;
 
 #elif defined DST_SINGLE
 typedef float dst_t;
@@ -78,7 +83,8 @@ typedef uint32_t dst_rep_t;
 static const int dstBits = sizeof(dst_t) * CHAR_BIT;
 static const int dstSigFracBits = 23;
 // -1 accounts for the sign bit.
-static const int dstExpBits = dstBits - dstSigFracBits - 1;
+// dstBits - dstSigFracBits - 1
+static const int dstExpBits = 8;
 
 #elif defined DST_HALF
 #ifdef COMPILER_RT_HAS_FLOAT16
@@ -91,7 +97,8 @@ typedef uint16_t dst_rep_t;
 static const int dstBits = sizeof(dst_t) * CHAR_BIT;
 static const int dstSigFracBits = 10;
 // -1 accounts for the sign bit.
-static const int dstExpBits = dstBits - dstSigFracBits - 1;
+// dstBits - dstSigFracBits - 1
+static const int dstExpBits = 5;
 
 #elif defined DST_BFLOAT
 typedef __bf16 dst_t;
@@ -100,7 +107,8 @@ typedef uint16_t dst_rep_t;
 static const int dstBits = sizeof(dst_t) * CHAR_BIT;
 static const int dstSigFracBits = 7;
 // -1 accounts for the sign bit.
-static const int dstExpBits = dstBits - dstSigFracBits - 1;
+// dstBits - dstSigFracBits - 1
+static const int dstExpBits = 8;
 
 #else
 #error Destination should be single precision or double precision!

>From db6ada80955f6f57a0785187d511e7f6c3ecdde9 Mon Sep 17 00:00:00 2001
From: Peter Klausler <35819229+klausler at users.noreply.github.com>
Date: Mon, 16 Oct 2023 16:51:46 -0700
Subject: [PATCH 13/14] [flang][NFC] Speed up large DATA statement
 initializations (#67585)

To ensure that the map from symbols to their initial images has an entry
for a particular symbol, use std::map<>::find() before
std::map<>::emplace() to avoid needless memory allocation and
deallocation. Also, combine adjacent intervals in the lists of
initialized ranges so that contiguous initializations don't require long
lists.

Fixes https://github.com/llvm/llvm-project/issues/66452.
---
 flang/lib/Semantics/data-to-inits.cpp | 29 +++++++++++++++------------
 flang/lib/Semantics/data-to-inits.h   | 16 +++++++++++++++
 2 files changed, 32 insertions(+), 13 deletions(-)

diff --git a/flang/lib/Semantics/data-to-inits.cpp b/flang/lib/Semantics/data-to-inits.cpp
index bc0355a2c597a6f..85bce874e78cdeb 100644
--- a/flang/lib/Semantics/data-to-inits.cpp
+++ b/flang/lib/Semantics/data-to-inits.cpp
@@ -81,7 +81,7 @@ template <typename DSV = parser::DataStmtValue> class ValueListIterator {
 };
 
 template <typename DSV> void ValueListIterator<DSV>::SetRepetitionCount() {
-  for (repetitionsRemaining_ = 1; at_ != end_; ++at_) {
+  for (; at_ != end_; ++at_) {
     auto repetitions{GetValue().repetitions};
     if (repetitions < 0) {
       hasFatalError_ = true;
@@ -335,10 +335,15 @@ bool DataInitializationCompiler<DSV>::InitElement(
     }
   }};
   const auto GetImage{[&]() -> evaluate::InitialImage & {
-    auto iter{inits_.emplace(&symbol, symbol.size())};
-    auto &symbolInit{iter.first->second};
-    symbolInit.initializedRanges.emplace_back(
-        offsetSymbol.offset(), offsetSymbol.size());
+    // This could be (and was) written to always call std::map<>::emplace(),
+    // which should handle duplicate entries gracefully, but it was still
+    // causing memory allocation & deallocation with gcc.
+    auto iter{inits_.find(&symbol)};
+    if (iter == inits_.end()) {
+      iter = inits_.emplace(&symbol, symbol.size()).first;
+    }
+    auto &symbolInit{iter->second};
+    symbolInit.NoteInitializedRange(offsetSymbol);
     return symbolInit.image;
   }};
   const auto OutOfRangeError{[&]() {
@@ -590,8 +595,7 @@ static void PopulateWithComponentDefaults(SymbolDataInitialization &init,
           }
         }
         if (initialized) {
-          init.initializedRanges.emplace_back(
-              componentOffset, component.size());
+          init.NoteInitializedRange(componentOffset, component.size());
         }
       }
     } else if (const auto *proc{component.detailsIf<ProcEntityDetails>()}) {
@@ -599,8 +603,7 @@ static void PopulateWithComponentDefaults(SymbolDataInitialization &init,
         SomeExpr procPtrInit{evaluate::ProcedureDesignator{**proc->init()}};
         auto extant{init.image.AsConstantPointer(componentOffset)};
         if (!extant || !(*extant == procPtrInit)) {
-          init.initializedRanges.emplace_back(
-              componentOffset, component.size());
+          init.NoteInitializedRange(componentOffset, component.size());
           init.image.AddPointer(componentOffset, std::move(procPtrInit));
         }
       }
@@ -651,7 +654,7 @@ static void IncorporateExplicitInitialization(
   if (iter != inits.end()) { // DATA statement initialization
     for (const auto &range : iter->second.initializedRanges) {
       auto at{offset + range.start()};
-      combined.initializedRanges.emplace_back(at, range.size());
+      combined.NoteInitializedRange(at, range.size());
       combined.image.Incorporate(
           at, iter->second.image, range.start(), range.size());
     }
@@ -663,7 +666,7 @@ static void IncorporateExplicitInitialization(
     if (IsPointer(mutableSymbol)) {
       if (auto *object{mutableSymbol.detailsIf<ObjectEntityDetails>()}) {
         if (object->init()) {
-          combined.initializedRanges.emplace_back(offset, mutableSymbol.size());
+          combined.NoteInitializedRange(offset, mutableSymbol.size());
           combined.image.AddPointer(offset, *object->init());
           if (removeOriginalInits) {
             object->init().reset();
@@ -671,7 +674,7 @@ static void IncorporateExplicitInitialization(
         }
       } else if (auto *proc{mutableSymbol.detailsIf<ProcEntityDetails>()}) {
         if (proc->init() && *proc->init()) {
-          combined.initializedRanges.emplace_back(offset, mutableSymbol.size());
+          combined.NoteInitializedRange(offset, mutableSymbol.size());
           combined.image.AddPointer(
               offset, SomeExpr{evaluate::ProcedureDesignator{**proc->init()}});
           if (removeOriginalInits) {
@@ -681,7 +684,7 @@ static void IncorporateExplicitInitialization(
       }
     } else if (auto *object{mutableSymbol.detailsIf<ObjectEntityDetails>()}) {
       if (!IsNamedConstant(mutableSymbol) && object->init()) {
-        combined.initializedRanges.emplace_back(offset, mutableSymbol.size());
+        combined.NoteInitializedRange(offset, mutableSymbol.size());
         combined.image.Add(
             offset, mutableSymbol.size(), *object->init(), foldingContext);
         if (removeOriginalInits) {
diff --git a/flang/lib/Semantics/data-to-inits.h b/flang/lib/Semantics/data-to-inits.h
index 10d850d23d5d636..d8cc4601de26fa9 100644
--- a/flang/lib/Semantics/data-to-inits.h
+++ b/flang/lib/Semantics/data-to-inits.h
@@ -11,6 +11,7 @@
 
 #include "flang/Common/default-kinds.h"
 #include "flang/Common/interval.h"
+#include "flang/Evaluate/fold-designator.h"
 #include "flang/Evaluate/initial-image.h"
 #include <list>
 #include <map>
@@ -30,6 +31,21 @@ struct SymbolDataInitialization {
   using Range = common::Interval<common::ConstantSubscript>;
   explicit SymbolDataInitialization(std::size_t bytes) : image{bytes} {}
   SymbolDataInitialization(SymbolDataInitialization &&) = default;
+
+  void NoteInitializedRange(Range range) {
+    if (initializedRanges.empty() ||
+        !initializedRanges.back().AnnexIfPredecessor(range)) {
+      initializedRanges.emplace_back(range);
+    }
+  }
+  void NoteInitializedRange(
+      common::ConstantSubscript offset, std::size_t size) {
+    NoteInitializedRange(Range{offset, size});
+  }
+  void NoteInitializedRange(evaluate::OffsetSymbol offsetSymbol) {
+    NoteInitializedRange(offsetSymbol.offset(), offsetSymbol.size());
+  }
+
   evaluate::InitialImage image;
   std::list<Range> initializedRanges;
 };

>From f1845b42d4e0ce5a3aecc59e10a96360eac875ae Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston at google.com>
Date: Mon, 16 Oct 2023 23:53:59 +0000
Subject: [PATCH 14/14] Whitespace fix

---
 compiler-rt/test/hwasan/TestCases/deep-recursion.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compiler-rt/test/hwasan/TestCases/deep-recursion.c b/compiler-rt/test/hwasan/TestCases/deep-recursion.c
index c992f205917ba22..9a43fb419a39fc1 100644
--- a/compiler-rt/test/hwasan/TestCases/deep-recursion.c
+++ b/compiler-rt/test/hwasan/TestCases/deep-recursion.c
@@ -35,7 +35,7 @@ __attribute__((noinline)) void OOB() {
   // around it: if the tag is zero, we use the neighboring variable instead,
   // which must have a different (hence non-zero) tag.
   // This tag check assumes aarch64.
-  if(((unsigned long)&x) >> 56 == 0) {
+  if (((unsigned long)&x) >> 56 == 0) {
     y[four] = 0;
   } else {
     x[four] = 0;