[flang-commits] [flang] Lower std::string's alignment requirement from 16 to 8. (PR #68807)

via flang-commits flang-commits at lists.llvm.org
Thu Oct 12 10:03:24 PDT 2023


https://github.com/EricWF updated https://github.com/llvm/llvm-project/pull/68807

>From e5ad214dc3070039f4860e5bcb363211fe899066 Mon Sep 17 00:00:00 2001
From: Eric Fiselier <eric at efcs.ca>
Date: Wed, 11 Oct 2023 10:36:28 -0400
Subject: [PATCH 01/38] Lower std::string's alignment requirement from 16 to 8.

This allows smaller allocations, closer to the size std::string actually requires. It is particularly effective at decreasing the allocation size on initial construction (where __recommend is called to determine the size).

Although the initial saving is never more than 8 bytes per string, this quickly adds up and has led to not insignificant memory savings at Google.

Unfortunately, this change is ABI breaking because it changes the value returned by max_size, so it has to be guarded behind the new ABI macro.
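To make the effect concrete, here is a minimal sketch of the rounding arithmetic (not the actual libc++ code: align_up/recommend are illustrative names mirroring __align_it/__recommend, and the SSO/__min_cap handling is omitted). For a 64-character string, rounding to 8 bytes yields a capacity of 71 instead of 79, matching the new allocation_size.pass.cpp test below.

  #include <cassert>
  #include <cstddef>

  // Mirror of __align_it: round s up to a multiple of a (a power of two).
  constexpr std::size_t align_up(std::size_t s, std::size_t a) {
    return (s + (a - 1)) & ~(a - 1);
  }

  // Simplified model of __recommend for char strings past the SSO threshold:
  // reserve space for the terminator, round up to the alignment boundary,
  // then report the rest as capacity.
  constexpr std::size_t recommend(std::size_t s, std::size_t alignment) {
    return align_up(s + 1, alignment) - 1;
  }

  int main() {
    assert(recommend(64, 8) == 71);   // with _LIBCPP_ABI_STRING_8_BYTE_ALIGNMENT
    assert(recommend(64, 16) == 79);  // default: 8 more bytes requested
    return 0;
  }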
---
 libcxx/include/__config                       |  5 +++
 libcxx/include/string                         |  9 +++-
 .../string.capacity/allocation_size.pass.cpp  | 42 +++++++++++++++++++
 .../string.capacity/max_size.pass.cpp         |  9 +++-
 4 files changed, 62 insertions(+), 3 deletions(-)
 create mode 100644 libcxx/test/libcxx/strings/basic.string/string.capacity/allocation_size.pass.cpp

diff --git a/libcxx/include/__config b/libcxx/include/__config
index 55d9f1c737652e6..3b0a70931c9eec8 100644
--- a/libcxx/include/__config
+++ b/libcxx/include/__config
@@ -167,6 +167,11 @@
 // The implementation moved to the header, but we still export the symbols from
 // the dylib for backwards compatibility.
 #    define _LIBCPP_ABI_DO_NOT_EXPORT_TO_CHARS_BASE_10
+// Save memory by giving the allocator more freedom to choose the most
+// efficient size class, by dropping the alignment requirement for
+// std::string's pointer from 16 to 8. This changes the value returned by
+// std::string::max_size, which makes it ABI breaking.
+#    define _LIBCPP_ABI_STRING_8_BYTE_ALIGNMENT
 #  elif _LIBCPP_ABI_VERSION == 1
 #    if !(defined(_LIBCPP_OBJECT_FORMAT_COFF) || defined(_LIBCPP_OBJECT_FORMAT_XCOFF))
 // Enable compiling copies of now inline methods into the dylib to support
diff --git a/libcxx/include/string b/libcxx/include/string
index 33e87406a1156a6..3078715e02b358a 100644
--- a/libcxx/include/string
+++ b/libcxx/include/string
@@ -1851,7 +1851,14 @@ private:
         _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20
         size_type __align_it(size_type __s) _NOEXCEPT
             {return (__s + (__a-1)) & ~(__a-1);}
-    enum {__alignment = 16};
+    enum {
+      __alignment =
+#ifdef _LIBCPP_ABI_STRING_8_BYTE_ALIGNMENT
+      8
+#else
+      16
+#endif
+    };
     static _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20
     size_type __recommend(size_type __s) _NOEXCEPT
     {
diff --git a/libcxx/test/libcxx/strings/basic.string/string.capacity/allocation_size.pass.cpp b/libcxx/test/libcxx/strings/basic.string/string.capacity/allocation_size.pass.cpp
new file mode 100644
index 000000000000000..bb0976e29509924
--- /dev/null
+++ b/libcxx/test/libcxx/strings/basic.string/string.capacity/allocation_size.pass.cpp
@@ -0,0 +1,42 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <string>
+
+// This test demonstrates the smaller allocation sizes when the alignment
+// requirements of std::string are dropped from 16 to 8.
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <string>
+
+#include "test_macros.h"
+
+// alignment of the string heap buffer is hardcoded to either 8 or 16
+constexpr std::size_t alignment =
+#ifdef _LIBCPP_ABI_STRING_8_BYTE_ALIGNMENT
+    8;
+#else
+    16;
+#endif
+
+int main(int, char**) {
+  std::string input_string;
+  input_string.resize(64, 'a');
+
+  // Call a constructor which selects its size using __recommend.
+  std::string test_string(input_string.data());
+  constexpr std::size_t expected_align8_size = 71;
+
+  // Demonstrate the smaller capacity/allocation size when the alignment requirement is 8.
+  if (alignment == 8) {
+    assert(test_string.capacity() == expected_align8_size);
+  } else {
+    assert(test_string.capacity() == expected_align8_size + 8);
+  }
+}
diff --git a/libcxx/test/libcxx/strings/basic.string/string.capacity/max_size.pass.cpp b/libcxx/test/libcxx/strings/basic.string/string.capacity/max_size.pass.cpp
index 5af9cab0be4e80a..ca32376ad8fcc08 100644
--- a/libcxx/test/libcxx/strings/basic.string/string.capacity/max_size.pass.cpp
+++ b/libcxx/test/libcxx/strings/basic.string/string.capacity/max_size.pass.cpp
@@ -17,8 +17,13 @@
 
 #include "test_macros.h"
 
-// alignment of the string heap buffer is hardcoded to 16
-static const std::size_t alignment = 16;
+// alignment of the string heap buffer is hardcoded to either 8 or 16
+static const std::size_t alignment =
+#ifdef _LIBCPP_ABI_STRING_8_BYTE_ALIGNMENT
+    8;
+#else
+    16;
+#endif
 
 template <class = int>
 TEST_CONSTEXPR_CXX20 void full_size() {

>From 0c0df584df43f5beeba0d2df21388360a331f841 Mon Sep 17 00:00:00 2001
From: Eric Fiselier <eric at efcs.ca>
Date: Wed, 11 Oct 2023 14:49:13 -0400
Subject: [PATCH 02/38] Add release notes for alignment change

---
 libcxx/docs/ReleaseNotes/18.rst | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/libcxx/docs/ReleaseNotes/18.rst b/libcxx/docs/ReleaseNotes/18.rst
index 5f43d2f2afe22d3..b7fe7d19141bbe5 100644
--- a/libcxx/docs/ReleaseNotes/18.rst
+++ b/libcxx/docs/ReleaseNotes/18.rst
@@ -133,6 +133,10 @@ ABI Affecting Changes
   results in an ABI break, however in practice we expect uses of ``std::projected`` in ABI-sensitive places to be
   extremely rare. Any error resulting from this change should result in a link-time error.
 
+- When ``_LIBCPP_ABI_STRING_8_BYTE_ALIGNMENT`` is enabled, ``std::string`` no longer rounds allocation
+  sizes up to a multiple of 16 bytes. This is an ABI break because it changes the value returned by
+  ``max_size()`` and can cause different exceptions to be thrown when a very large string is created.
+
 Build System Changes
 --------------------
 

>From 0c9546e8cd8830b0dcc0474160e836dfdbe2ce05 Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles at arm.com>
Date: Wed, 13 Sep 2023 11:25:01 +0000
Subject: [PATCH 03/38] [flang][FIR] add FIR TBAA pass

See RFC at
https://discourse.llvm.org/t/rfc-propagate-fir-alias-analysis-information-using-tbaa/73755

This pass adds TBAA tags to all accesses to non-pointer/target dummy
arguments. These TBAA tags tell LLVM that these accesses cannot alias:
allowing better dead code elimination, hoisting out of loops, and
vectorization.

Each function has its own TBAA tree so that accesses between functions
MayAlias after inlining.

I also included code for adding tags for local allocations and for
global variables. Enabling all three kinds of tag is known to produce a
miscompile and so these are disabled by default. But it isn't much
code and I thought it could be interesting to play with these later if
one is looking at a benchmark which looks like it would benefit from
more alias information. I'm open to removing this code too.

TBAA tags are also added separately by TBAABuilder during CodeGen.
TBAABuilder has to run during CodeGen because it adds tags to box
accesses, many of which are implicit in FIR. This pass cannot (easily)
run in CodeGen because fir::AliasAnalysis has difficulty tracing values
between blocks, and by the time CodeGen runs, structured control flow
has already been lowered.

Coming in follow-up patches:
  - Change CodeGen/TBAABuilder to use TBAAForest to add tags within the
    same per-function trees as are used here (delayed to a later patch
    to make it easier to revert)
  - Command line argument processing to actually enable the pass
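For illustration, here is a minimal sketch of how a client of the new API obtains and attaches a tag. The TBAAForrest/TBAATree/FirAliasTagOpInterface names come from this patch; the helper function itself is hypothetical and the real logic lives in AddAliasTags.cpp below.

  #include "flang/Optimizer/Analysis/TBAAForest.h"
  #include "flang/Optimizer/Dialect/FirAliasTagOpInterface.h"
  #include "mlir/Dialect/Func/IR/FuncOps.h"
  #include "mlir/IR/BuiltinAttributes.h"
  #include "llvm/ADT/StringRef.h"

  // Hypothetical helper: attach a TBAA tag to one load/store that accesses
  // the dummy argument named `dummyArgName`.
  static void tagDummyArgAccess(fir::FirAliasTagOpInterface op,
                                mlir::func::FuncOp func,
                                fir::TBAAForrest &forrest,
                                llvm::StringRef dummyArgName) {
    // operator[] lazily builds a separate tree per function, so tags from
    // different functions conservatively MayAlias after inlining.
    const fir::TBAATree &tree = forrest[func];
    // Distinct tags under the "dummy arg data" subtree do not alias each
    // other, nor tags from the global/allocation subtrees of the same tree.
    mlir::LLVM::TBAATagAttr tag = tree.dummyArgDataTree.getTag(dummyArgName);
    op.setTBAATags(mlir::ArrayAttr::get(func.getContext(), tag));
  }

The tbaa.fir test below shows the attribute trees this produces for dummy arguments.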
---
 .../flang/Optimizer/Analysis/TBAAForest.h     | 104 +++++
 .../flang/Optimizer/Transforms/Passes.h       |   1 +
 .../flang/Optimizer/Transforms/Passes.td      |  20 +
 flang/lib/Optimizer/Analysis/CMakeLists.txt   |   1 +
 flang/lib/Optimizer/Analysis/TBAAForest.cpp   |  60 +++
 .../lib/Optimizer/Transforms/AddAliasTags.cpp | 210 ++++++++++
 flang/lib/Optimizer/Transforms/CMakeLists.txt |   1 +
 flang/test/Transforms/tbaa.fir                | 175 ++++++++
 flang/test/Transforms/tbaa2.fir               | 386 ++++++++++++++++++
 9 files changed, 958 insertions(+)
 create mode 100644 flang/include/flang/Optimizer/Analysis/TBAAForest.h
 create mode 100644 flang/lib/Optimizer/Analysis/TBAAForest.cpp
 create mode 100644 flang/lib/Optimizer/Transforms/AddAliasTags.cpp
 create mode 100644 flang/test/Transforms/tbaa.fir
 create mode 100644 flang/test/Transforms/tbaa2.fir

diff --git a/flang/include/flang/Optimizer/Analysis/TBAAForest.h b/flang/include/flang/Optimizer/Analysis/TBAAForest.h
new file mode 100644
index 000000000000000..a024544e50ef98b
--- /dev/null
+++ b/flang/include/flang/Optimizer/Analysis/TBAAForest.h
@@ -0,0 +1,104 @@
+//===-- TBAAForest.h - A TBAA tree for each function -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_OPTIMIZER_ANALYSIS_TBAA_FOREST_H
+#define FORTRAN_OPTIMIZER_ANALYSIS_TBAA_FOREST_H
+
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/LLVMIR/LLVMAttrs.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/IR/MLIRContext.h"
+#include "llvm/ADT/DenseMap.h"
+#include <string>
+
+namespace fir {
+
+//===----------------------------------------------------------------------===//
+// TBAATree
+//===----------------------------------------------------------------------===//
+/// Per-function TBAA tree. Each tree contains branches for data (of various
+/// kinds) and descriptor access
+struct TBAATree {
+  //===----------------------------------------------------------------------===//
+  // TBAATree::SubtreeState
+  //===----------------------------------------------------------------------===//
+  /// This contains a TBAA subtree based on some parent. New tags can be added
+  /// under the parent using getTag.
+  class SubtreeState {
+    friend TBAATree; // only allow construction by TBAATree
+  public:
+    SubtreeState() = delete;
+    SubtreeState(const SubtreeState &) = delete;
+    SubtreeState(SubtreeState &&) = default;
+
+    mlir::LLVM::TBAATagAttr getTag(llvm::StringRef uniqueId) const;
+
+  private:
+    SubtreeState(mlir::MLIRContext *ctx, std::string name,
+                 mlir::LLVM::TBAANodeAttr grandParent)
+        : parentId{std::move(name)}, context(ctx) {
+      parent = mlir::LLVM::TBAATypeDescriptorAttr::get(
+          context, parentId, mlir::LLVM::TBAAMemberAttr::get(grandParent, 0));
+    }
+
+    const std::string parentId;
+    mlir::MLIRContext *const context;
+    mlir::LLVM::TBAATypeDescriptorAttr parent;
+    llvm::DenseMap<llvm::StringRef, mlir::LLVM::TBAATagAttr> tagDedup;
+  };
+
+  SubtreeState globalDataTree;
+  SubtreeState allocatedDataTree;
+  SubtreeState dummyArgDataTree;
+  mlir::LLVM::TBAATypeDescriptorAttr anyAccessDesc;
+  mlir::LLVM::TBAATypeDescriptorAttr boxMemberTypeDesc;
+  mlir::LLVM::TBAATypeDescriptorAttr anyDataTypeDesc;
+
+  static TBAATree buildTree(mlir::StringAttr functionName);
+
+private:
+  TBAATree(mlir::LLVM::TBAATypeDescriptorAttr anyAccess,
+           mlir::LLVM::TBAATypeDescriptorAttr dataRoot,
+           mlir::LLVM::TBAATypeDescriptorAttr boxMemberTypeDesc);
+};
+
+//===----------------------------------------------------------------------===//
+// TBAAForrest
+//===----------------------------------------------------------------------===//
+/// Collection of TBAATrees, usually indexed by function (so that each function
+/// has a different TBAATree)
+class TBAAForrest {
+public:
+  explicit TBAAForrest(bool separatePerFunction = true)
+      : separatePerFunction{separatePerFunction} {}
+
+  inline const TBAATree &operator[](mlir::func::FuncOp func) {
+    return getFuncTree(func.getSymNameAttr());
+  }
+  inline const TBAATree &operator[](mlir::LLVM::LLVMFuncOp func) {
+    return getFuncTree(func.getSymNameAttr());
+  }
+
+private:
+  const TBAATree &getFuncTree(mlir::StringAttr symName) {
+    if (!separatePerFunction)
+      symName = mlir::StringAttr::get(symName.getContext(), "");
+    if (!trees.contains(symName))
+      trees.insert({symName, TBAATree::buildTree(symName)});
+    return trees.at(symName);
+  }
+
+  // Should each function use a different tree?
+  const bool separatePerFunction;
+  // TBAA tree per function
+  llvm::DenseMap<mlir::StringAttr, TBAATree> trees;
+};
+
+} // namespace fir
+
+#endif // FORTRAN_OPTIMIZER_ANALYSIS_TBAA_FOREST_H
diff --git a/flang/include/flang/Optimizer/Transforms/Passes.h b/flang/include/flang/Optimizer/Transforms/Passes.h
index 64882c8ec406b0a..30d97be3800c191 100644
--- a/flang/include/flang/Optimizer/Transforms/Passes.h
+++ b/flang/include/flang/Optimizer/Transforms/Passes.h
@@ -61,6 +61,7 @@ std::unique_ptr<mlir::Pass> createMemDataFlowOptPass();
 std::unique_ptr<mlir::Pass> createPromoteToAffinePass();
 std::unique_ptr<mlir::Pass> createMemoryAllocationPass();
 std::unique_ptr<mlir::Pass> createStackArraysPass();
+std::unique_ptr<mlir::Pass> createAliasTagsPass();
 std::unique_ptr<mlir::Pass> createSimplifyIntrinsicsPass();
 std::unique_ptr<mlir::Pass> createAddDebugFoundationPass();
 std::unique_ptr<mlir::Pass> createLoopVersioningPass();
diff --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td
index 80da485392007fa..6d211a535b53f70 100644
--- a/flang/include/flang/Optimizer/Transforms/Passes.td
+++ b/flang/include/flang/Optimizer/Transforms/Passes.td
@@ -252,6 +252,26 @@ def StackArrays : Pass<"stack-arrays", "mlir::ModuleOp"> {
   let constructor = "::fir::createStackArraysPass()";
 }
 
+def AddAliasTags : Pass<"fir-add-alias-tags", "mlir::ModuleOp"> {
+  let summary = "Add TBAA tags to operations that implement FirAliasTagOpInterface";
+  let description = [{
+    TBAA (type-based alias analysis) is one method to pass pointer alias information
+    from language frontends to LLVM. This pass uses fir::AliasAnalysis to add this
+    information to fir.load and fir.store operations.
+    Additional tags are added during codegen. See fir::TBAABuilder.
+    This needs to be a separate pass so that it happens before structured control
+    flow operations are lowered to branches and basic blocks (this makes tracing
+    the source of values much easier). The other TBAA tags need to be applied to
+    box loads and stores which are implicit in FIR and so cannot be annotated
+    until codegen.
+    TODO: this is currently a pass on mlir::ModuleOp to avoid parallelism. In
+    theory, each operation could be considered in parallel, so long as there
+    aren't races adding new tags to the mlir context.
+  }];
+  let dependentDialects = [ "fir::FIROpsDialect" ];
+  let constructor = "::fir::createAliasTagsPass()";
+}
+
 def SimplifyRegionLite : Pass<"simplify-region-lite", "mlir::ModuleOp"> {
   let summary = "Region simplification";
   let description = [{
diff --git a/flang/lib/Optimizer/Analysis/CMakeLists.txt b/flang/lib/Optimizer/Analysis/CMakeLists.txt
index 19dadf72cf4ce14..436d4d3f18969c1 100644
--- a/flang/lib/Optimizer/Analysis/CMakeLists.txt
+++ b/flang/lib/Optimizer/Analysis/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_flang_library(FIRAnalysis
   AliasAnalysis.cpp
+  TBAAForest.cpp
 
   DEPENDS
   FIRDialect
diff --git a/flang/lib/Optimizer/Analysis/TBAAForest.cpp b/flang/lib/Optimizer/Analysis/TBAAForest.cpp
new file mode 100644
index 000000000000000..070e2be6700cc11
--- /dev/null
+++ b/flang/lib/Optimizer/Analysis/TBAAForest.cpp
@@ -0,0 +1,60 @@
+//===- TBAAForest.cpp - Per-function TBAA Trees ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Analysis/TBAAForest.h"
+#include <mlir/Dialect/LLVMIR/LLVMAttrs.h>
+
+mlir::LLVM::TBAATagAttr
+fir::TBAATree::SubtreeState::getTag(llvm::StringRef uniqueName) const {
+  // mlir::LLVM::TBAATagAttr &tag = tagDedup[uniqueName];
+  // if (tag)
+  //   return tag;
+  std::string id = (parentId + "/" + uniqueName).str();
+  mlir::LLVM::TBAATypeDescriptorAttr type =
+      mlir::LLVM::TBAATypeDescriptorAttr::get(
+          context, id, mlir::LLVM::TBAAMemberAttr::get(parent, 0));
+  return mlir::LLVM::TBAATagAttr::get(type, type, 0);
+  // return tag;
+}
+
+fir::TBAATree fir::TBAATree::buildTree(mlir::StringAttr func) {
+  llvm::StringRef funcName = func.getValue();
+  std::string rootId = ("Flang function root " + funcName).str();
+  mlir::MLIRContext *ctx = func.getContext();
+  mlir::LLVM::TBAARootAttr funcRoot =
+      mlir::LLVM::TBAARootAttr::get(ctx, mlir::StringAttr::get(ctx, rootId));
+
+  static constexpr llvm::StringRef anyAccessTypeDescId = "any access";
+  mlir::LLVM::TBAATypeDescriptorAttr anyAccess =
+      mlir::LLVM::TBAATypeDescriptorAttr::get(
+          ctx, anyAccessTypeDescId,
+          mlir::LLVM::TBAAMemberAttr::get(funcRoot, 0));
+
+  static constexpr llvm::StringRef anyDataAccessTypeDescId = "any data access";
+  mlir::LLVM::TBAATypeDescriptorAttr dataRoot =
+      mlir::LLVM::TBAATypeDescriptorAttr::get(
+          ctx, anyDataAccessTypeDescId,
+          mlir::LLVM::TBAAMemberAttr::get(anyAccess, 0));
+
+  static constexpr llvm::StringRef boxMemberTypeDescId = "descriptor member";
+  mlir::LLVM::TBAATypeDescriptorAttr boxMemberTypeDesc =
+      mlir::LLVM::TBAATypeDescriptorAttr::get(
+          ctx, boxMemberTypeDescId,
+          mlir::LLVM::TBAAMemberAttr::get(anyAccess, 0));
+
+  return TBAATree{anyAccess, dataRoot, boxMemberTypeDesc};
+}
+
+fir::TBAATree::TBAATree(mlir::LLVM::TBAATypeDescriptorAttr anyAccess,
+                        mlir::LLVM::TBAATypeDescriptorAttr dataRoot,
+                        mlir::LLVM::TBAATypeDescriptorAttr boxMemberTypeDesc)
+    : globalDataTree(dataRoot.getContext(), "global data", dataRoot),
+      allocatedDataTree(dataRoot.getContext(), "allocated data", dataRoot),
+      dummyArgDataTree(dataRoot.getContext(), "dummy arg data", dataRoot),
+      anyAccessDesc(anyAccess), boxMemberTypeDesc(boxMemberTypeDesc),
+      anyDataTypeDesc(dataRoot) {}
diff --git a/flang/lib/Optimizer/Transforms/AddAliasTags.cpp b/flang/lib/Optimizer/Transforms/AddAliasTags.cpp
new file mode 100644
index 000000000000000..25439837acac518
--- /dev/null
+++ b/flang/lib/Optimizer/Transforms/AddAliasTags.cpp
@@ -0,0 +1,210 @@
+//===- AddAliasTags.cpp ---------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+/// \file
+/// Adds TBAA alias tags to fir loads and stores, based on information from
+/// fir::AliasAnalysis. More are added later in CodeGen - see fir::TBAABuilder
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Analysis/AliasAnalysis.h"
+#include "flang/Optimizer/Analysis/TBAAForest.h"
+#include "flang/Optimizer/Dialect/FIRDialect.h"
+#include "flang/Optimizer/Dialect/FirAliasTagOpInterface.h"
+#include "flang/Optimizer/Transforms/Passes.h"
+#include "mlir/Pass/Pass.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <optional>
+
+namespace fir {
+#define GEN_PASS_DEF_ADDALIASTAGS
+#include "flang/Optimizer/Transforms/Passes.h.inc"
+} // namespace fir
+
+#define DEBUG_TYPE "fir-add-alias-tags"
+
+static llvm::cl::opt<bool>
+    enableDummyArgs("dummy-arg-tbaa", llvm::cl::init(true), llvm::cl::Hidden,
+                    llvm::cl::desc("Add TBAA tags to dummy arguments"));
+// These two are **known unsafe** (miscompare in spec2017/wrf_r). They should
+// not be enabled by default.
+// The code is kept so that these may be tried with new benchmarks to see if
+// this is worth fixing in the future.
+static llvm::cl::opt<bool>
+    enableGlobals("globals-tbaa", llvm::cl::init(false), llvm::cl::Hidden,
+                  llvm::cl::desc("Add TBAA tags to global variables. UNSAFE."));
+static llvm::cl::opt<bool> enableLocalAllocs(
+    "local-alloc-tbaa", llvm::cl::init(false), llvm::cl::Hidden,
+    llvm::cl::desc("Add TBAA tags to local allocations. UNSAFE."));
+
+namespace {
+
+/// Shared state per-module
+class PassState {
+public:
+  /// memoised call to fir::AliasAnalysis::getSource
+  inline const fir::AliasAnalysis::Source &getSource(mlir::Value value) {
+    if (!analysisCache.contains(value))
+      analysisCache.insert({value, analysis.getSource(value)});
+    return analysisCache[value];
+  }
+
+  /// get the per-function TBAATree for this function
+  inline const fir::TBAATree &getFuncTree(mlir::func::FuncOp func) {
+    return forrest[func];
+  }
+
+private:
+  fir::AliasAnalysis analysis;
+  llvm::DenseMap<mlir::Value, fir::AliasAnalysis::Source> analysisCache;
+  fir::TBAAForrest forrest;
+};
+
+class AddAliasTagsPass : public fir::impl::AddAliasTagsBase<AddAliasTagsPass> {
+public:
+  void runOnOperation() override;
+
+private:
+  /// The real workhorse of the pass. This is a runOnOperation() which
+  /// operates on fir::FirAliasTagOpInterface, using some extra state
+  void runOnAliasInterface(fir::FirAliasTagOpInterface op, PassState &state);
+};
+
+} // namespace
+
+static fir::DeclareOp getDeclareOp(mlir::Value arg) {
+  for (mlir::Operation *use : arg.getUsers())
+    if (fir::DeclareOp declare = mlir::dyn_cast<fir::DeclareOp>(use))
+      return declare;
+  return nullptr;
+}
+
+/// Get the name of a function argument using the "fir.bindc_name" attribute,
+/// or ""
+static std::string getFuncArgName(mlir::Value arg) {
+  // first try getting the name from the hlfir.declare
+  if (fir::DeclareOp declare = getDeclareOp(arg))
+    return declare.getUniqName().str();
+
+  // get from attribute on function argument
+  // always succeeds because arg is a function argument
+  mlir::BlockArgument blockArg = mlir::cast<mlir::BlockArgument>(arg);
+  assert(blockArg.getOwner() && blockArg.getOwner()->isEntryBlock() &&
+         "arg is a function argument");
+  mlir::FunctionOpInterface func = mlir::dyn_cast<mlir::FunctionOpInterface>(
+      blockArg.getOwner()->getParentOp());
+  mlir::StringAttr attr = func.getArgAttrOfType<mlir::StringAttr>(
+      blockArg.getArgNumber(), "fir.bindc_name");
+  if (!attr)
+    return "";
+  return attr.str();
+}
+
+void AddAliasTagsPass::runOnAliasInterface(fir::FirAliasTagOpInterface op,
+                                           PassState &state) {
+  mlir::func::FuncOp func = op->getParentOfType<mlir::func::FuncOp>();
+  if (!func)
+    return;
+
+  llvm::SmallVector<mlir::Value> accessedOperands = op.getAccessedOperands();
+  assert(accessedOperands.size() == 1 &&
+         "load and store only access one address");
+  mlir::Value memref = accessedOperands.front();
+
+  // skip boxes. These get an "any descriptor access" tag in TBAABuilder
+  // (CodeGen). I didn't see any speedup from giving each box a separate TBAA
+  // type.
+  if (mlir::isa<fir::BaseBoxType>(fir::unwrapRefType(memref.getType())))
+    return;
+  LLVM_DEBUG(llvm::dbgs() << "Analysing " << op << "\n");
+
+  const fir::AliasAnalysis::Source &source = state.getSource(memref);
+  if (source.isTargetOrPointer()) {
+    LLVM_DEBUG(llvm::dbgs().indent(2) << "Skipping TARGET/POINTER\n");
+    // These will get an "any data access" tag in TBAABuilder (CodeGen): causing
+    // them to "MayAlias" with all non-box accesses
+    return;
+  }
+
+  mlir::LLVM::TBAATagAttr tag;
+  // TBAA for dummy arguments
+  if (enableDummyArgs &&
+      source.kind == fir::AliasAnalysis::SourceKind::Argument) {
+    LLVM_DEBUG(llvm::dbgs().indent(2)
+               << "Found reference to dummy argument at " << *op << "\n");
+    std::string name = getFuncArgName(source.u.get<mlir::Value>());
+    if (!name.empty())
+      tag = state.getFuncTree(func).dummyArgDataTree.getTag(name);
+    else
+      LLVM_DEBUG(llvm::dbgs().indent(2)
+                 << "WARN: couldn't find a name for dummy argument " << *op
+                 << "\n");
+
+    // TBAA for global variables
+  } else if (enableGlobals &&
+             source.kind == fir::AliasAnalysis::SourceKind::Global) {
+    mlir::SymbolRefAttr glbl = source.u.get<mlir::SymbolRefAttr>();
+    const char *name = glbl.getRootReference().data();
+    LLVM_DEBUG(llvm::dbgs().indent(2) << "Found reference to global " << name
+                                      << " at " << *op << "\n");
+    tag = state.getFuncTree(func).globalDataTree.getTag(name);
+
+    // TBAA for local allocations
+  } else if (enableLocalAllocs &&
+             source.kind == fir::AliasAnalysis::SourceKind::Allocate) {
+    std::optional<llvm::StringRef> name;
+    mlir::Operation *sourceOp = source.u.get<mlir::Value>().getDefiningOp();
+    if (auto alloc = mlir::dyn_cast_or_null<fir::AllocaOp>(sourceOp))
+      name = alloc.getUniqName();
+    else if (auto alloc = mlir::dyn_cast_or_null<fir::AllocMemOp>(sourceOp))
+      name = alloc.getUniqName();
+    if (name) {
+      LLVM_DEBUG(llvm::dbgs().indent(2) << "Found reference to allocation "
+                                        << name << " at " << *op << "\n");
+      tag = state.getFuncTree(func).allocatedDataTree.getTag(*name);
+    } else {
+      LLVM_DEBUG(llvm::dbgs().indent(2)
+                 << "WARN: couldn't find a name for allocation " << *op
+                 << "\n");
+    }
+  } else {
+    if (source.kind != fir::AliasAnalysis::SourceKind::Argument &&
+        source.kind != fir::AliasAnalysis::SourceKind::Allocate &&
+        source.kind != fir::AliasAnalysis::SourceKind::Global)
+      LLVM_DEBUG(llvm::dbgs().indent(2)
+                 << "WARN: unsupported value: " << source << "\n");
+  }
+
+  if (tag)
+    op.setTBAATags(mlir::ArrayAttr::get(&getContext(), tag));
+}
+
+void AddAliasTagsPass::runOnOperation() {
+  LLVM_DEBUG(llvm::dbgs() << "=== Begin " DEBUG_TYPE " ===\n");
+
+  // MLIR forbids storing state in a pass because different instances might be
+  // used in different threads.
+  // Instead this pass stores state per mlir::ModuleOp (which is what MLIR
+  // thinks the pass operates on), then the real work of the pass is done in
+  // runOnAliasInterface
+  PassState state;
+
+  mlir::ModuleOp mod = getOperation();
+  mod.walk(
+      [&](fir::FirAliasTagOpInterface op) { runOnAliasInterface(op, state); });
+
+  LLVM_DEBUG(llvm::dbgs() << "=== End " DEBUG_TYPE " ===\n");
+}
+
+std::unique_ptr<mlir::Pass> fir::createAliasTagsPass() {
+  return std::make_unique<AddAliasTagsPass>();
+}
diff --git a/flang/lib/Optimizer/Transforms/CMakeLists.txt b/flang/lib/Optimizer/Transforms/CMakeLists.txt
index 428c4c2a1e64408..74a093fe74719b6 100644
--- a/flang/lib/Optimizer/Transforms/CMakeLists.txt
+++ b/flang/lib/Optimizer/Transforms/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_flang_library(FIRTransforms
   AbstractResult.cpp
+  AddAliasTags.cpp
   AffinePromotion.cpp
   AffineDemotion.cpp
   AnnotateConstant.cpp
diff --git a/flang/test/Transforms/tbaa.fir b/flang/test/Transforms/tbaa.fir
new file mode 100644
index 000000000000000..7825ae60c71e681
--- /dev/null
+++ b/flang/test/Transforms/tbaa.fir
@@ -0,0 +1,175 @@
+// RUN: fir-opt --split-input-file --fir-add-alias-tags %s | FileCheck %s
+
+// subroutine oneArg(a)
+//   integer :: a(:)
+//   a(1) = a(2)
+// end subroutine
+  func.func @_QPonearg(%arg0: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "a"}) {
+    %c1 = arith.constant 1 : index
+    %c2 = arith.constant 2 : index
+    %0 = fir.declare %arg0 {uniq_name = "_QFoneargEa"} : (!fir.box<!fir.array<?xi32>>) -> !fir.box<!fir.array<?xi32>>
+    %1 = fir.rebox %0 : (!fir.box<!fir.array<?xi32>>) -> !fir.box<!fir.array<?xi32>>
+    %2 = fir.array_coor %1 %c2 : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+    %3 = fir.load %2 : !fir.ref<i32>
+    %4 = fir.array_coor %1 %c1 : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+    fir.store %3 to %4 : !fir.ref<i32>
+    return
+  }
+
+// CHECK: #[[ONE_ARG_ROOT:.+]] = #llvm.tbaa_root<id = "Flang function root _QPonearg">
+// CHECK: #[[ONE_ARG_ANY_ACCESS:.+]] = #llvm.tbaa_type_desc<id = "any access", members = {<#[[ONE_ARG_ROOT]], 0>}>
+// CHECK: #[[ONE_ARG_ANY_DATA:.+]] = #llvm.tbaa_type_desc<id = "any data access", members = {<#[[ONE_ARG_ANY_ACCESS]], 0>}>
+// CHECK: #[[ONE_ARG_ANY_ARG:.+]] = #llvm.tbaa_type_desc<id = "dummy arg data", members = {<#[[ONE_ARG_ANY_DATA]], 0>}>
+// CHECK: #[[ONE_ARG_A:.+]] = #llvm.tbaa_type_desc<id = "dummy arg data/_QFoneargEa", members = {<#[[ONE_ARG_ANY_ARG]], 0>}>
+// CHECK: #[[ONE_ARG_A_TAG:.+]] = #llvm.tbaa_tag<base_type = #[[ONE_ARG_A]], access_type = #[[ONE_ARG_A]], offset = 0>
+
+// CHECK-LABEL:   func.func @_QPonearg(
+// CHECK-SAME:                         %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "a"}) {
+// CHECK:           %[[VAL_1:.*]] = arith.constant 1 : index
+// CHECK:           %[[VAL_2:.*]] = arith.constant 2 : index
+// CHECK:           %[[VAL_3:.*]] = fir.declare %[[VAL_0]] {uniq_name = "_QFoneargEa"} : (!fir.box<!fir.array<?xi32>>) -> !fir.box<!fir.array<?xi32>>
+// CHECK:           %[[VAL_4:.*]] = fir.rebox %[[VAL_3]] : (!fir.box<!fir.array<?xi32>>) -> !fir.box<!fir.array<?xi32>>
+// CHECK:           %[[VAL_5:.*]] = fir.array_coor %[[VAL_4]] %[[VAL_2]] : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+// CHECK:           %[[VAL_6:.*]] = fir.load %[[VAL_5]] {tbaa = [#[[ONE_ARG_A_TAG]]]} : !fir.ref<i32>
+// CHECK:           %[[VAL_7:.*]] = fir.array_coor %[[VAL_4]] %[[VAL_1]] : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+// CHECK:           fir.store %[[VAL_6]] to %[[VAL_7]] {tbaa = [#[[ONE_ARG_A_TAG]]]} : !fir.ref<i32>
+// CHECK:           return
+// CHECK:         }
+
+// -----
+
+// subroutine twoArg(a, b)
+//   integer :: a(:), b(:)
+//   a(1) = b(1)
+// end subroutine
+  func.func @_QPtwoarg(%arg0: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "a"}, %arg1: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "b"}) {
+    %c1 = arith.constant 1 : index
+    %0 = fir.declare %arg0 {uniq_name = "_QFtwoargEa"} : (!fir.box<!fir.array<?xi32>>) -> !fir.box<!fir.array<?xi32>>
+    %1 = fir.rebox %0 : (!fir.box<!fir.array<?xi32>>) -> !fir.box<!fir.array<?xi32>>
+    %2 = fir.declare %arg1 {uniq_name = "_QFtwoargEb"} : (!fir.box<!fir.array<?xi32>>) -> !fir.box<!fir.array<?xi32>>
+    %3 = fir.rebox %2 : (!fir.box<!fir.array<?xi32>>) -> !fir.box<!fir.array<?xi32>>
+    %4 = fir.array_coor %3 %c1 : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+    %5 = fir.load %4 : !fir.ref<i32>
+    %6 = fir.array_coor %1 %c1 : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+    fir.store %5 to %6 : !fir.ref<i32>
+    return
+  }
+
+// CHECK: #[[TWO_ARG_ROOT:.+]] = #llvm.tbaa_root<id = "Flang function root _QPtwoarg">
+// CHECK: #[[TWO_ARG_ANY_ACCESS:.+]] = #llvm.tbaa_type_desc<id = "any access", members = {<#[[TWO_ARG_ROOT]], 0>}>
+// CHECK: #[[TWO_ARG_ANY_DATA:.+]] = #llvm.tbaa_type_desc<id = "any data access", members = {<#[[TWO_ARG_ANY_ACCESS]], 0>}>
+// CHECK: #[[TWO_ARG_ANY_ARG:.+]] = #llvm.tbaa_type_desc<id = "dummy arg data", members = {<#[[TWO_ARG_ANY_DATA]], 0>}>
+// CHECK: #[[TWO_ARG_B:.+]] = #llvm.tbaa_type_desc<id = "dummy arg data/_QFtwoargEb", members = {<#[[TWO_ARG_ANY_ARG]], 0>}>
+// CHECK: #[[TWO_ARG_A:.+]] = #llvm.tbaa_type_desc<id = "dummy arg data/_QFtwoargEa", members = {<#[[TWO_ARG_ANY_ARG]], 0>}>
+// CHECK: #[[TWO_ARG_B_TAG:.+]] = #llvm.tbaa_tag<base_type = #[[TWO_ARG_B]], access_type = #[[TWO_ARG_B]], offset = 0>
+// CHECK: #[[TWO_ARG_A_TAG:.+]] = #llvm.tbaa_tag<base_type = #[[TWO_ARG_A]], access_type = #[[TWO_ARG_A]], offset = 0>
+
+// CHECK-LABEL:   func.func @_QPtwoarg(
+// CHECK-SAME:                         %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "a"},
+// CHECK-SAME:                         %[[VAL_1:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "b"}) {
+// CHECK:           %[[VAL_2:.*]] = arith.constant 1 : index
+// CHECK:           %[[VAL_3:.*]] = fir.declare %[[VAL_0]] {uniq_name = "_QFtwoargEa"} : (!fir.box<!fir.array<?xi32>>) -> !fir.box<!fir.array<?xi32>>
+// CHECK:           %[[VAL_4:.*]] = fir.rebox %[[VAL_3]] : (!fir.box<!fir.array<?xi32>>) -> !fir.box<!fir.array<?xi32>>
+// CHECK:           %[[VAL_5:.*]] = fir.declare %[[VAL_1]] {uniq_name = "_QFtwoargEb"} : (!fir.box<!fir.array<?xi32>>) -> !fir.box<!fir.array<?xi32>>
+// CHECK:           %[[VAL_6:.*]] = fir.rebox %[[VAL_5]] : (!fir.box<!fir.array<?xi32>>) -> !fir.box<!fir.array<?xi32>>
+// CHECK:           %[[VAL_7:.*]] = fir.array_coor %[[VAL_6]] %[[VAL_2]] : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+// CHECK:           %[[VAL_8:.*]] = fir.load %[[VAL_7]] {tbaa = [#[[TWO_ARG_B_TAG]]]} : !fir.ref<i32>
+// CHECK:           %[[VAL_9:.*]] = fir.array_coor %[[VAL_4]] %[[VAL_2]] : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+// CHECK:           fir.store %[[VAL_8]] to %[[VAL_9]] {tbaa = [#[[TWO_ARG_A_TAG]]]} : !fir.ref<i32>
+// CHECK:           return
+// CHECK:         }
+
+// -----
+
+// subroutine targetArg(a, b)
+//   integer, target :: a(:)
+//   integer :: b(:)
+//   a(1) = b(1)
+// end subroutine
+  func.func @_QPtargetarg(%arg0: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "a", fir.target}, %arg1: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "b"}) {
+    %c1 = arith.constant 1 : index
+    %0 = fir.declare %arg0 {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFtargetargEa"} : (!fir.box<!fir.array<?xi32>>) -> !fir.box<!fir.array<?xi32>>
+    %1 = fir.rebox %0 : (!fir.box<!fir.array<?xi32>>) -> !fir.box<!fir.array<?xi32>>
+    %2 = fir.declare %arg1 {uniq_name = "_QFtargetargEb"} : (!fir.box<!fir.array<?xi32>>) -> !fir.box<!fir.array<?xi32>>
+    %3 = fir.rebox %2 : (!fir.box<!fir.array<?xi32>>) -> !fir.box<!fir.array<?xi32>>
+    %4 = fir.array_coor %3 %c1 : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+    %5 = fir.load %4 : !fir.ref<i32>
+    %6 = fir.array_coor %1 %c1 : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+    fir.store %5 to %6 : !fir.ref<i32>
+    return
+  }
+
+// CHECK: #[[TARGET_ROOT:.+]] = #llvm.tbaa_root<id = "Flang function root _QPtargetarg">
+// CHECK: #[[TARGET_ANY_ACCESS:.+]] = #llvm.tbaa_type_desc<id = "any access", members = {<#[[TARGET_ROOT]], 0>}>
+// CHECK: #[[TARGET_ANY_DATA:.+]] = #llvm.tbaa_type_desc<id = "any data access", members = {<#[[TARGET_ANY_ACCESS]], 0>}>
+// CHECK: #[[TARGET_ANY_ARG:.+]] = #llvm.tbaa_type_desc<id = "dummy arg data", members = {<#[[TARGET_ANY_DATA]], 0>}>
+// CHECK: #[[TARGET_B:.+]] = #llvm.tbaa_type_desc<id = "dummy arg data/_QFtargetargEb", members = {<#[[TARGET_ANY_ARG]], 0>}>
+// CHECK: #[[TARGET_B_TAG:.+]] = #llvm.tbaa_tag<base_type = #[[TARGET_B]], access_type = #[[TARGET_B]], offset = 0>
+// No entry for "dummy arg data/a" because that argument gets "any data access" since it has the TARGET attribute
+
+// CHECK-LABEL:   func.func @_QPtargetarg(
+// CHECK-SAME:                            %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "a", fir.target},
+// CHECK-SAME:                            %[[VAL_1:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "b"}) {
+// CHECK:           %[[VAL_2:.*]] = arith.constant 1 : index
+// CHECK:           %[[VAL_3:.*]] = fir.declare %[[VAL_0]] {fortran_attrs = #{{.*}}<target>, uniq_name = "_QFtargetargEa"} : (!fir.box<!fir.array<?xi32>>) -> !fir.box<!fir.array<?xi32>>
+// CHECK:           %[[VAL_4:.*]] = fir.rebox %[[VAL_3]] : (!fir.box<!fir.array<?xi32>>) -> !fir.box<!fir.array<?xi32>>
+// CHECK:           %[[VAL_5:.*]] = fir.declare %[[VAL_1]] {uniq_name = "_QFtargetargEb"} : (!fir.box<!fir.array<?xi32>>) -> !fir.box<!fir.array<?xi32>>
+// CHECK:           %[[VAL_6:.*]] = fir.rebox %[[VAL_5]] : (!fir.box<!fir.array<?xi32>>) -> !fir.box<!fir.array<?xi32>>
+// CHECK:           %[[VAL_7:.*]] = fir.array_coor %[[VAL_6]] %[[VAL_2]] : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+// CHECK:           %[[VAL_8:.*]] = fir.load %[[VAL_7]] {tbaa = [#[[TARGET_B_TAG]]]} : !fir.ref<i32>
+// CHECK:           %[[VAL_9:.*]] = fir.array_coor %[[VAL_4]] %[[VAL_2]] : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+// "any data access" tag is added by TBAABuilder during CodeGen
+// CHECK:           fir.store %[[VAL_8]] to %[[VAL_9]] : !fir.ref<i32>
+// CHECK:           return
+// CHECK:         }
+
+// -----
+
+// subroutine pointerArg(a, b)
+//   integer, pointer :: a(:)
+//   integer :: b(:)
+//   a(1) = b(1)
+// end subroutine
+  func.func @_QPpointerarg(%arg0: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>> {fir.bindc_name = "a"}, %arg1: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "b"}) {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %0 = fir.declare %arg0 {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFpointerargEa"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+    %1 = fir.declare %arg1 {uniq_name = "_QFpointerargEb"} : (!fir.box<!fir.array<?xi32>>) -> !fir.box<!fir.array<?xi32>>
+    %2 = fir.rebox %1 : (!fir.box<!fir.array<?xi32>>) -> !fir.box<!fir.array<?xi32>>
+    %3 = fir.array_coor %2 %c1 : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+    %4 = fir.load %3 : !fir.ref<i32>
+    %5 = fir.load %0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+    %6:3 = fir.box_dims %5, %c0 : (!fir.box<!fir.ptr<!fir.array<?xi32>>>, index) -> (index, index, index)
+    %7 = fir.shift %6#0 : (index) -> !fir.shift<1>
+    %8 = fir.array_coor %5(%7) %c1 : (!fir.box<!fir.ptr<!fir.array<?xi32>>>, !fir.shift<1>, index) -> !fir.ref<i32>
+    fir.store %4 to %8 : !fir.ref<i32>
+    return
+  }
+
+// CHECK: #[[POINTER_ROOT:.+]] = #llvm.tbaa_root<id = "Flang function root _QPpointerarg">
+// CHECK: #[[POINTER_ANY_ACCESS:.+]] = #llvm.tbaa_type_desc<id = "any access", members = {<#[[POINTER_ROOT]], 0>}>
+// CHECK: #[[POINTER_ANY_DATA:.+]] = #llvm.tbaa_type_desc<id = "any data access", members = {<#[[POINTER_ANY_ACCESS]], 0>}>
+// CHECK: #[[POINTER_ANY_ARG:.+]] = #llvm.tbaa_type_desc<id = "dummy arg data", members = {<#[[POINTER_ANY_DATA]], 0>}>
+// CHECK: #[[POINTER_B:.+]] = #llvm.tbaa_type_desc<id = "dummy arg data/_QFpointerargEb", members = {<#[[POINTER_ANY_ARG]], 0>}>
+// CHECK: #[[POINTER_B_TAG:.+]] = #llvm.tbaa_tag<base_type = #[[POINTER_B]], access_type = #[[POINTER_B]], offset = 0>
+// No entry for "dummy arg data/a" because that pointer gets "any data access" since it has the POINTER attribute
+
+// CHECK-LABEL:   func.func @_QPpointerarg(
+// CHECK-SAME:                             %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>> {fir.bindc_name = "a"},
+// CHECK-SAME:                             %[[VAL_1:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "b"}) {
+// CHECK:           %[[VAL_2:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_3:.*]] = arith.constant 1 : index
+// CHECK:           %[[VAL_4:.*]] = fir.declare %[[VAL_0]] {fortran_attrs = #{{.*}}<pointer>, uniq_name = "_QFpointerargEa"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+// CHECK:           %[[VAL_5:.*]] = fir.declare %[[VAL_1]] {uniq_name = "_QFpointerargEb"} : (!fir.box<!fir.array<?xi32>>) -> !fir.box<!fir.array<?xi32>>
+// CHECK:           %[[VAL_6:.*]] = fir.rebox %[[VAL_5]] : (!fir.box<!fir.array<?xi32>>) -> !fir.box<!fir.array<?xi32>>
+// CHECK:           %[[VAL_7:.*]] = fir.array_coor %[[VAL_6]] %[[VAL_3]] : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+// CHECK:           %[[VAL_8:.*]] = fir.load %[[VAL_7]] {tbaa = [#[[POINTER_B_TAG]]]} : !fir.ref<i32>
+// "any descriptor access" tag is added by TBAABuilder during CodeGen
+// CHECK:           %[[VAL_9:.*]] = fir.load %[[VAL_4]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>
+// CHECK:           %[[VAL_10:.*]]:3 = fir.box_dims %[[VAL_9]], %[[VAL_2]] : (!fir.box<!fir.ptr<!fir.array<?xi32>>>, index) -> (index, index, index)
+// CHECK:           %[[VAL_11:.*]] = fir.shift %[[VAL_10]]#0 : (index) -> !fir.shift<1>
+// CHECK:           %[[VAL_12:.*]] = fir.array_coor %[[VAL_9]](%[[VAL_11]]) %[[VAL_3]] : (!fir.box<!fir.ptr<!fir.array<?xi32>>>, !fir.shift<1>, index) -> !fir.ref<i32>
+// "any data access" tag is added by TBAABuilder during CodeGen
+// CHECK:           fir.store %[[VAL_8]] to %[[VAL_12]] : !fir.ref<i32>
+// CHECK:           return
+// CHECK:         }
diff --git a/flang/test/Transforms/tbaa2.fir b/flang/test/Transforms/tbaa2.fir
new file mode 100644
index 000000000000000..84ba281cce7a956
--- /dev/null
+++ b/flang/test/Transforms/tbaa2.fir
@@ -0,0 +1,386 @@
+// Test the fir-add-alias-tags pass on a larger real-life code example (from the RFC)
+// RUN: fir-opt --fir-add-alias-tags %s | FileCheck %s
+
+  fir.global @_QMmodEa : !fir.box<!fir.heap<!fir.array<?xf32>>> {
+    %c0 = arith.constant 0 : index
+    %0 = fir.zero_bits !fir.heap<!fir.array<?xf32>>
+    %1 = fir.shape %c0 : (index) -> !fir.shape<1>
+    %2 = fir.embox %0(%1) : (!fir.heap<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.box<!fir.heap<!fir.array<?xf32>>>
+    fir.has_value %2 : !fir.box<!fir.heap<!fir.array<?xf32>>>
+  }
+  fir.global @_QMmodEb : !fir.box<!fir.heap<!fir.array<?xf32>>> {
+    %c0 = arith.constant 0 : index
+    %0 = fir.zero_bits !fir.heap<!fir.array<?xf32>>
+    %1 = fir.shape %c0 : (index) -> !fir.shape<1>
+    %2 = fir.embox %0(%1) : (!fir.heap<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.box<!fir.heap<!fir.array<?xf32>>>
+    fir.has_value %2 : !fir.box<!fir.heap<!fir.array<?xf32>>>
+  }
+  fir.global @_QMmodEdxinv : f32 {
+    %0 = fir.zero_bits f32
+    fir.has_value %0 : f32
+  }
+  fir.global @_QMmodEdyinv : f32 {
+    %0 = fir.zero_bits f32
+    fir.has_value %0 : f32
+  }
+  fir.global @_QMmodExstart : i32 {
+    %0 = fir.zero_bits i32
+    fir.has_value %0 : i32
+  }
+  fir.global @_QMmodEystart : i32 {
+    %0 = fir.zero_bits i32
+    fir.has_value %0 : i32
+  }
+  fir.global @_QMmodEystop : i32 {
+    %0 = fir.zero_bits i32
+    fir.has_value %0 : i32
+  }
+  fir.global @_QMmodEzstart : i32 {
+    %0 = fir.zero_bits i32
+    fir.has_value %0 : i32
+  }
+  fir.global @_QMmodEzstop : i32 {
+    %0 = fir.zero_bits i32
+    fir.has_value %0 : i32
+  }
+
+// CHECK: #[[ROOT:.+]] = #llvm.tbaa_root<id = "Flang function root _QMmodPcallee">
+// CHECK: #[[ANY_ACCESS:.+]] = #llvm.tbaa_type_desc<id = "any access", members = {<#[[ROOT]], 0>}>
+// CHECK: #[[ANY_DATA:.+]] = #llvm.tbaa_type_desc<id = "any data access", members = {<#[[ANY_ACCESS]], 0>}>
+// CHECK: #[[ANY_ARG:.+]] = #llvm.tbaa_type_desc<id = "dummy arg data", members = {<#[[ANY_DATA]], 0>}>
+// CHECK: #[[ARG_LOW:.+]] = #llvm.tbaa_type_desc<id = "dummy arg data/_QMmodFcalleeElow", members = {<#[[ANY_ARG]], 0>}>
+// CHECK: #[[ARG_Z:.+]] = #llvm.tbaa_type_desc<id = "dummy arg data/_QMmodFcalleeEz", members = {<#[[ANY_ARG]], 0>}>
+// CHECK: #[[ARG_Y:.+]] = #llvm.tbaa_type_desc<id = "dummy arg data/_QMmodFcalleeEy", members = {<#[[ANY_ARG]], 0>}>
+// CHECK: #[[ARG_LOW_TAG:.+]] = #llvm.tbaa_tag<base_type = #[[ARG_LOW]], access_type = #[[ARG_LOW]], offset = 0>
+// CHECK: #[[ARG_Z_TAG:.+]] = #llvm.tbaa_tag<base_type = #[[ARG_Z]], access_type = #[[ARG_Z]], offset = 0>
+// CHECK: #[[ARG_Y_TAG:.+]] = #llvm.tbaa_tag<base_type = #[[ARG_Y]], access_type = #[[ARG_Y]], offset = 0>
+
+  func.func @_QMmodPcallee(%arg0: !fir.box<!fir.array<?x?x?xf32>> {fir.bindc_name = "z"}, %arg1: !fir.box<!fir.array<?x?x?xf32>> {fir.bindc_name = "y"}, %arg2: !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xf32>>>> {fir.bindc_name = "low"}) {
+    %c2 = arith.constant 2 : index
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c1_i32 = arith.constant 1 : i32
+    %0 = fir.address_of(@_QMmodEa) : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
+    %1 = fir.declare %0 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMmodEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
+    %2 = fir.address_of(@_QMmodEb) : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
+    %3 = fir.declare %2 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMmodEb"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
+    %4 = fir.address_of(@_QMmodEdxinv) : !fir.ref<f32>
+    %5 = fir.declare %4 {uniq_name = "_QMmodEdxinv"} : (!fir.ref<f32>) -> !fir.ref<f32>
+    %6 = fir.address_of(@_QMmodEdyinv) : !fir.ref<f32>
+    %7 = fir.declare %6 {uniq_name = "_QMmodEdyinv"} : (!fir.ref<f32>) -> !fir.ref<f32>
+    %8 = fir.address_of(@_QMmodExstart) : !fir.ref<i32>
+    %9 = fir.declare %8 {uniq_name = "_QMmodExstart"} : (!fir.ref<i32>) -> !fir.ref<i32>
+    %10 = fir.address_of(@_QMmodEystart) : !fir.ref<i32>
+    %11 = fir.declare %10 {uniq_name = "_QMmodEystart"} : (!fir.ref<i32>) -> !fir.ref<i32>
+    %12 = fir.address_of(@_QMmodEystop) : !fir.ref<i32>
+    %13 = fir.declare %12 {uniq_name = "_QMmodEystop"} : (!fir.ref<i32>) -> !fir.ref<i32>
+    %14 = fir.address_of(@_QMmodEzstart) : !fir.ref<i32>
+    %15 = fir.declare %14 {uniq_name = "_QMmodEzstart"} : (!fir.ref<i32>) -> !fir.ref<i32>
+    %16 = fir.address_of(@_QMmodEzstop) : !fir.ref<i32>
+    %17 = fir.declare %16 {uniq_name = "_QMmodEzstop"} : (!fir.ref<i32>) -> !fir.ref<i32>
+    %18 = fir.alloca f32 {bindc_name = "dxold", uniq_name = "_QMmodFcalleeEdxold"}
+    %19 = fir.declare %18 {uniq_name = "_QMmodFcalleeEdxold"} : (!fir.ref<f32>) -> !fir.ref<f32>
+    %20 = fir.alloca f32 {bindc_name = "dzinv", uniq_name = "_QMmodFcalleeEdzinv"}
+    %21 = fir.declare %20 {uniq_name = "_QMmodFcalleeEdzinv"} : (!fir.ref<f32>) -> !fir.ref<f32>
+    %22 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QMmodFcalleeEi"}
+    %23 = fir.declare %22 {uniq_name = "_QMmodFcalleeEi"} : (!fir.ref<i32>) -> !fir.ref<i32>
+    %24 = fir.alloca i32 {bindc_name = "j", uniq_name = "_QMmodFcalleeEj"}
+    %25 = fir.declare %24 {uniq_name = "_QMmodFcalleeEj"} : (!fir.ref<i32>) -> !fir.ref<i32>
+    %26 = fir.alloca i32 {bindc_name = "k", uniq_name = "_QMmodFcalleeEk"}
+    %27 = fir.declare %26 {uniq_name = "_QMmodFcalleeEk"} : (!fir.ref<i32>) -> !fir.ref<i32>
+    %28 = fir.declare %arg2 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMmodFcalleeElow"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xf32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xf32>>>>
+    %29 = fir.declare %arg1 {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QMmodFcalleeEy"} : (!fir.box<!fir.array<?x?x?xf32>>) -> !fir.box<!fir.array<?x?x?xf32>>
+    %30 = fir.rebox %29 : (!fir.box<!fir.array<?x?x?xf32>>) -> !fir.box<!fir.array<?x?x?xf32>>
+    %31 = fir.declare %arg0 {fortran_attrs = #fir.var_attrs<intent_in>, uniq_name = "_QMmodFcalleeEz"} : (!fir.box<!fir.array<?x?x?xf32>>) -> !fir.box<!fir.array<?x?x?xf32>>
+    %32 = fir.rebox %31 : (!fir.box<!fir.array<?x?x?xf32>>) -> !fir.box<!fir.array<?x?x?xf32>>
+    %33 = fir.load %15 : !fir.ref<i32>
+    %34 = arith.addi %33, %c1_i32 : i32
+    %35 = fir.convert %34 : (i32) -> index
+    %36 = fir.load %17 : !fir.ref<i32>
+    %37 = fir.convert %36 : (i32) -> index
+    %38 = fir.convert %35 : (index) -> i32
+    %39:2 = fir.do_loop %arg3 = %35 to %37 step %c1 iter_args(%arg4 = %38) -> (index, i32) {
+      fir.store %arg4 to %27 : !fir.ref<i32>
+      %40 = fir.load %11 : !fir.ref<i32>
+      %41 = arith.addi %40, %c1_i32 : i32
+      %42 = fir.convert %41 : (i32) -> index
+      %43 = fir.load %13 : !fir.ref<i32>
+      %44 = fir.convert %43 : (i32) -> index
+      %45 = fir.convert %42 : (index) -> i32
+      %46:2 = fir.do_loop %arg5 = %42 to %44 step %c1 iter_args(%arg6 = %45) -> (index, i32) {
+        fir.store %arg6 to %25 : !fir.ref<i32>
+        %51 = fir.load %9 : !fir.ref<i32>
+        %52 = arith.addi %51, %c1_i32 : i32
+        %53 = fir.convert %52 : (i32) -> index
+        %54 = fir.convert %53 : (index) -> i32
+        %55:2 = fir.do_loop %arg7 = %53 to %c0 step %c1 iter_args(%arg8 = %54) -> (index, i32) {
+          fir.store %arg8 to %23 : !fir.ref<i32>
+          %60 = fir.load %28 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xf32>>>>
+          %61 = fir.load %23 : !fir.ref<i32>
+          %62 = fir.convert %61 : (i32) -> i64
+          %63 = fir.load %25 : !fir.ref<i32>
+          %64 = fir.convert %63 : (i32) -> i64
+          %65 = fir.load %27 : !fir.ref<i32>
+          %66 = fir.convert %65 : (i32) -> i64
+          %67 = fir.box_addr %60 : (!fir.box<!fir.heap<!fir.array<?x?x?xf32>>>) -> !fir.heap<!fir.array<?x?x?xf32>>
+          %68:3 = fir.box_dims %60, %c0 : (!fir.box<!fir.heap<!fir.array<?x?x?xf32>>>, index) -> (index, index, index)
+          %69:3 = fir.box_dims %60, %c1 : (!fir.box<!fir.heap<!fir.array<?x?x?xf32>>>, index) -> (index, index, index)
+          %70:3 = fir.box_dims %60, %c2 : (!fir.box<!fir.heap<!fir.array<?x?x?xf32>>>, index) -> (index, index, index)
+          %71 = fir.shape_shift %68#0, %68#1, %69#0, %69#1, %70#0, %70#1 : (index, index, index, index, index, index) -> !fir.shapeshift<3>
+          %72 = fir.array_coor %67(%71) %62, %64, %66 : (!fir.heap<!fir.array<?x?x?xf32>>, !fir.shapeshift<3>, i64, i64, i64) -> !fir.ref<f32>
+          %73 = fir.load %72 : !fir.ref<f32>
+          fir.store %73 to %19 : !fir.ref<f32>
+          %74 = fir.load %1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
+          %75 = fir.load %25 : !fir.ref<i32>
+          %76 = fir.convert %75 : (i32) -> i64
+          %77 = fir.box_addr %74 : (!fir.box<!fir.heap<!fir.array<?xf32>>>) -> !fir.heap<!fir.array<?xf32>>
+          %78:3 = fir.box_dims %74, %c0 : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> (index, index, index)
+          %79 = fir.shape_shift %78#0, %78#1 : (index, index) -> !fir.shapeshift<1>
+          %80 = fir.array_coor %77(%79) %76 : (!fir.heap<!fir.array<?xf32>>, !fir.shapeshift<1>, i64) -> !fir.ref<f32>
+          %81 = fir.load %80 : !fir.ref<f32>
+          %82 = fir.load %28 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xf32>>>>
+          %83 = fir.load %23 : !fir.ref<i32>
+          %84 = fir.convert %83 : (i32) -> i64
+          %85 = fir.load %27 : !fir.ref<i32>
+          %86 = fir.convert %85 : (i32) -> i64
+          %87 = fir.box_addr %82 : (!fir.box<!fir.heap<!fir.array<?x?x?xf32>>>) -> !fir.heap<!fir.array<?x?x?xf32>>
+          %88:3 = fir.box_dims %82, %c0 : (!fir.box<!fir.heap<!fir.array<?x?x?xf32>>>, index) -> (index, index, index)
+          %89:3 = fir.box_dims %82, %c1 : (!fir.box<!fir.heap<!fir.array<?x?x?xf32>>>, index) -> (index, index, index)
+          %90:3 = fir.box_dims %82, %c2 : (!fir.box<!fir.heap<!fir.array<?x?x?xf32>>>, index) -> (index, index, index)
+          %91 = fir.shape_shift %88#0, %88#1, %89#0, %89#1, %90#0, %90#1 : (index, index, index, index, index, index) -> !fir.shapeshift<3>
+          %92 = fir.array_coor %87(%91) %84, %76, %86 : (!fir.heap<!fir.array<?x?x?xf32>>, !fir.shapeshift<3>, i64, i64, i64) -> !fir.ref<f32>
+          %93 = fir.load %92 : !fir.ref<f32>
+          %94 = arith.mulf %81, %93 fastmath<contract> : f32
+          %95 = fir.load %3 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
+          %96 = fir.box_addr %95 : (!fir.box<!fir.heap<!fir.array<?xf32>>>) -> !fir.heap<!fir.array<?xf32>>
+          %97:3 = fir.box_dims %95, %c0 : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> (index, index, index)
+          %98 = fir.shape_shift %97#0, %97#1 : (index, index) -> !fir.shapeshift<1>
+          %99 = fir.array_coor %96(%98) %76 : (!fir.heap<!fir.array<?xf32>>, !fir.shapeshift<1>, i64) -> !fir.ref<f32>
+          %100 = fir.load %99 : !fir.ref<f32>
+          %101 = fir.array_coor %32 %84, %76, %86 : (!fir.box<!fir.array<?x?x?xf32>>, i64, i64, i64) -> !fir.ref<f32>
+          %102 = fir.load %101 : !fir.ref<f32>
+          %103 = arith.subi %75, %c1_i32 : i32
+          %104 = fir.convert %103 : (i32) -> i64
+          %105 = fir.array_coor %32 %84, %104, %86 : (!fir.box<!fir.array<?x?x?xf32>>, i64, i64, i64) -> !fir.ref<f32>
+          %106 = fir.load %105 : !fir.ref<f32>
+          %107 = arith.subf %102, %106 fastmath<contract> : f32
+          %108 = fir.no_reassoc %107 : f32
+          %109 = fir.load %7 : !fir.ref<f32>
+          %110 = arith.mulf %108, %109 fastmath<contract> : f32
+          %111 = arith.subi %85, %c1_i32 : i32
+          %112 = fir.convert %111 : (i32) -> i64
+          %113 = fir.array_coor %30 %84, %76, %112 : (!fir.box<!fir.array<?x?x?xf32>>, i64, i64, i64) -> !fir.ref<f32>
+          %114 = fir.load %113 : !fir.ref<f32>
+          %115 = fir.array_coor %30 %84, %76, %86 : (!fir.box<!fir.array<?x?x?xf32>>, i64, i64, i64) -> !fir.ref<f32>
+          %116 = fir.load %115 : !fir.ref<f32>
+          %117 = arith.subf %114, %116 fastmath<contract> : f32
+          %118 = fir.no_reassoc %117 : f32
+          %119 = fir.load %21 : !fir.ref<f32>
+          %120 = arith.mulf %118, %119 fastmath<contract> : f32
+          %121 = arith.addf %110, %120 fastmath<contract> : f32
+          %122 = fir.no_reassoc %121 : f32
+          %123 = arith.mulf %100, %122 fastmath<contract> : f32
+          %124 = arith.addf %94, %123 fastmath<contract> : f32
+          fir.store %124 to %92 : !fir.ref<f32>
+          %125 = arith.addi %arg7, %c1 : index
+          %126 = fir.convert %c1 : (index) -> i32
+          %127 = fir.load %23 : !fir.ref<i32>
+          %128 = arith.addi %127, %126 : i32
+          fir.result %125, %128 : index, i32
+        }
+        fir.store %55#1 to %23 : !fir.ref<i32>
+        %56 = arith.addi %arg5, %c1 : index
+        %57 = fir.convert %c1 : (index) -> i32
+        %58 = fir.load %25 : !fir.ref<i32>
+        %59 = arith.addi %58, %57 : i32
+        fir.result %56, %59 : index, i32
+      }
+      fir.store %46#1 to %25 : !fir.ref<i32>
+      %47 = arith.addi %arg3, %c1 : index
+      %48 = fir.convert %c1 : (index) -> i32
+      %49 = fir.load %27 : !fir.ref<i32>
+      %50 = arith.addi %49, %48 : i32
+      fir.result %47, %50 : index, i32
+    }
+    fir.store %39#1 to %27 : !fir.ref<i32>
+    return
+  }
+// CHECK-LABEL:   func.func @_QMmodPcallee(
+// CHECK-SAME:                             %[[VAL_0:.*]]: !fir.box<!fir.array<?x?x?xf32>> {fir.bindc_name = "z"},
+// CHECK-SAME:                             %[[VAL_1:.*]]: !fir.box<!fir.array<?x?x?xf32>> {fir.bindc_name = "y"},
+// CHECK-SAME:                             %[[VAL_2:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xf32>>>> {fir.bindc_name = "low"}) {
+// CHECK:           %[[VAL_3:.*]] = arith.constant 2 : index
+// CHECK:           %[[VAL_4:.*]] = arith.constant 0 : index
+// CHECK:           %[[VAL_5:.*]] = arith.constant 1 : index
+// CHECK:           %[[VAL_6:.*]] = arith.constant 1 : i32
+// CHECK:           %[[VAL_7:.*]] = fir.address_of(@_QMmodEa) : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
+// CHECK:           %[[VAL_8:.*]] = fir.declare %[[VAL_7]] {fortran_attrs = #{{.*}}<allocatable>, uniq_name = "_QMmodEa"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
+// CHECK:           %[[VAL_9:.*]] = fir.address_of(@_QMmodEb) : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
+// CHECK:           %[[VAL_10:.*]] = fir.declare %[[VAL_9]] {fortran_attrs = #{{.*}}<allocatable>, uniq_name = "_QMmodEb"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
+// CHECK:           %[[VAL_11:.*]] = fir.address_of(@_QMmodEdxinv) : !fir.ref<f32>
+// CHECK:           %[[VAL_12:.*]] = fir.declare %[[VAL_11]] {uniq_name = "_QMmodEdxinv"} : (!fir.ref<f32>) -> !fir.ref<f32>
+// CHECK:           %[[VAL_13:.*]] = fir.address_of(@_QMmodEdyinv) : !fir.ref<f32>
+// CHECK:           %[[VAL_14:.*]] = fir.declare %[[VAL_13]] {uniq_name = "_QMmodEdyinv"} : (!fir.ref<f32>) -> !fir.ref<f32>
+// CHECK:           %[[VAL_15:.*]] = fir.address_of(@_QMmodExstart) : !fir.ref<i32>
+// CHECK:           %[[VAL_16:.*]] = fir.declare %[[VAL_15]] {uniq_name = "_QMmodExstart"} : (!fir.ref<i32>) -> !fir.ref<i32>
+// CHECK:           %[[VAL_17:.*]] = fir.address_of(@_QMmodEystart) : !fir.ref<i32>
+// CHECK:           %[[VAL_18:.*]] = fir.declare %[[VAL_17]] {uniq_name = "_QMmodEystart"} : (!fir.ref<i32>) -> !fir.ref<i32>
+// CHECK:           %[[VAL_19:.*]] = fir.address_of(@_QMmodEystop) : !fir.ref<i32>
+// CHECK:           %[[VAL_20:.*]] = fir.declare %[[VAL_19]] {uniq_name = "_QMmodEystop"} : (!fir.ref<i32>) -> !fir.ref<i32>
+// CHECK:           %[[VAL_21:.*]] = fir.address_of(@_QMmodEzstart) : !fir.ref<i32>
+// CHECK:           %[[VAL_22:.*]] = fir.declare %[[VAL_21]] {uniq_name = "_QMmodEzstart"} : (!fir.ref<i32>) -> !fir.ref<i32>
+// CHECK:           %[[VAL_23:.*]] = fir.address_of(@_QMmodEzstop) : !fir.ref<i32>
+// CHECK:           %[[VAL_24:.*]] = fir.declare %[[VAL_23]] {uniq_name = "_QMmodEzstop"} : (!fir.ref<i32>) -> !fir.ref<i32>
+// CHECK:           %[[VAL_25:.*]] = fir.alloca f32 {bindc_name = "dxold", uniq_name = "_QMmodFcalleeEdxold"}
+// CHECK:           %[[VAL_26:.*]] = fir.declare %[[VAL_25]] {uniq_name = "_QMmodFcalleeEdxold"} : (!fir.ref<f32>) -> !fir.ref<f32>
+// CHECK:           %[[VAL_27:.*]] = fir.alloca f32 {bindc_name = "dzinv", uniq_name = "_QMmodFcalleeEdzinv"}
+// CHECK:           %[[VAL_28:.*]] = fir.declare %[[VAL_27]] {uniq_name = "_QMmodFcalleeEdzinv"} : (!fir.ref<f32>) -> !fir.ref<f32>
+// CHECK:           %[[VAL_29:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QMmodFcalleeEi"}
+// CHECK:           %[[VAL_30:.*]] = fir.declare %[[VAL_29]] {uniq_name = "_QMmodFcalleeEi"} : (!fir.ref<i32>) -> !fir.ref<i32>
+// CHECK:           %[[VAL_31:.*]] = fir.alloca i32 {bindc_name = "j", uniq_name = "_QMmodFcalleeEj"}
+// CHECK:           %[[VAL_32:.*]] = fir.declare %[[VAL_31]] {uniq_name = "_QMmodFcalleeEj"} : (!fir.ref<i32>) -> !fir.ref<i32>
+// CHECK:           %[[VAL_33:.*]] = fir.alloca i32 {bindc_name = "k", uniq_name = "_QMmodFcalleeEk"}
+// CHECK:           %[[VAL_34:.*]] = fir.declare %[[VAL_33]] {uniq_name = "_QMmodFcalleeEk"} : (!fir.ref<i32>) -> !fir.ref<i32>
+// CHECK:           %[[VAL_35:.*]] = fir.declare %[[VAL_2]] {fortran_attrs = #{{.*}}<allocatable>, uniq_name = "_QMmodFcalleeElow"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xf32>>>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xf32>>>>
+// CHECK:           %[[VAL_36:.*]] = fir.declare %[[VAL_1]] {fortran_attrs = #{{.*}}<intent_in>, uniq_name = "_QMmodFcalleeEy"} : (!fir.box<!fir.array<?x?x?xf32>>) -> !fir.box<!fir.array<?x?x?xf32>>
+// CHECK:           %[[VAL_37:.*]] = fir.rebox %[[VAL_36]] : (!fir.box<!fir.array<?x?x?xf32>>) -> !fir.box<!fir.array<?x?x?xf32>>
+// CHECK:           %[[VAL_38:.*]] = fir.declare %[[VAL_0]] {fortran_attrs = #{{.*}}<intent_in>, uniq_name = "_QMmodFcalleeEz"} : (!fir.box<!fir.array<?x?x?xf32>>) -> !fir.box<!fir.array<?x?x?xf32>>
+// CHECK:           %[[VAL_39:.*]] = fir.rebox %[[VAL_38]] : (!fir.box<!fir.array<?x?x?xf32>>) -> !fir.box<!fir.array<?x?x?xf32>>
+// TODO: read from global assumed to always alias
+// CHECK:           %[[VAL_40:.*]] = fir.load %[[VAL_22]] : !fir.ref<i32>
+// CHECK:           %[[VAL_41:.*]] = arith.addi %[[VAL_40]], %[[VAL_6]] : i32
+// CHECK:           %[[VAL_42:.*]] = fir.convert %[[VAL_41]] : (i32) -> index
+// TODO: read from global assumed to always alias
+// CHECK:           %[[VAL_43:.*]] = fir.load %[[VAL_24]] : !fir.ref<i32>
+// CHECK:           %[[VAL_44:.*]] = fir.convert %[[VAL_43]] : (i32) -> index
+// CHECK:           %[[VAL_45:.*]] = fir.convert %[[VAL_42]] : (index) -> i32
+// CHECK:           %[[VAL_46:.*]]:2 = fir.do_loop %[[VAL_47:.*]] = %[[VAL_42]] to %[[VAL_44]] step %[[VAL_5]] iter_args(%[[VAL_48:.*]] = %[[VAL_45]]) -> (index, i32) {
+// CHECK:             fir.store %[[VAL_48]] to %[[VAL_34]] : !fir.ref<i32>
+// TODO: read from global assumed to always alias
+// CHECK:             %[[VAL_49:.*]] = fir.load %[[VAL_18]] : !fir.ref<i32>
+// CHECK:             %[[VAL_50:.*]] = arith.addi %[[VAL_49]], %[[VAL_6]] : i32
+// CHECK:             %[[VAL_51:.*]] = fir.convert %[[VAL_50]] : (i32) -> index
+// TODO: read from global assumed to always alias
+// CHECK:             %[[VAL_52:.*]] = fir.load %[[VAL_20]] : !fir.ref<i32>
+// CHECK:             %[[VAL_53:.*]] = fir.convert %[[VAL_52]] : (i32) -> index
+// CHECK:             %[[VAL_54:.*]] = fir.convert %[[VAL_51]] : (index) -> i32
+// CHECK:             %[[VAL_55:.*]]:2 = fir.do_loop %[[VAL_56:.*]] = %[[VAL_51]] to %[[VAL_53]] step %[[VAL_5]] iter_args(%[[VAL_57:.*]] = %[[VAL_54]]) -> (index, i32) {
+// CHECK:               fir.store %[[VAL_57]] to %[[VAL_32]] : !fir.ref<i32>
+// TODO: read from global assumed to always alias
+// CHECK:               %[[VAL_58:.*]] = fir.load %[[VAL_16]] : !fir.ref<i32>
+// CHECK:               %[[VAL_59:.*]] = arith.addi %[[VAL_58]], %[[VAL_6]] : i32
+// CHECK:               %[[VAL_60:.*]] = fir.convert %[[VAL_59]] : (i32) -> index
+// CHECK:               %[[VAL_61:.*]] = fir.convert %[[VAL_60]] : (index) -> i32
+// CHECK:               %[[VAL_62:.*]]:2 = fir.do_loop %[[VAL_63:.*]] = %[[VAL_60]] to %[[VAL_4]] step %[[VAL_5]] iter_args(%[[VAL_64:.*]] = %[[VAL_61]]) -> (index, i32) {
+// TODO: local allocation assumed to always alias
+// CHECK:                 fir.store %[[VAL_64]] to %[[VAL_30]] : !fir.ref<i32>
+// load from box tagged in CodeGen
+// CHECK:                 %[[VAL_65:.*]] = fir.load %[[VAL_35]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xf32>>>>
+// TODO: local allocation assumed to always alias
+// CHECK:                 %[[VAL_66:.*]] = fir.load %[[VAL_30]] : !fir.ref<i32>
+// CHECK:                 %[[VAL_67:.*]] = fir.convert %[[VAL_66]] : (i32) -> i64
+// TODO: local allocation assumed to always alias
+// CHECK:                 %[[VAL_68:.*]] = fir.load %[[VAL_32]] : !fir.ref<i32>
+// CHECK:                 %[[VAL_69:.*]] = fir.convert %[[VAL_68]] : (i32) -> i64
+// TODO: local allocation assumed to always alias
+// CHECK:                 %[[VAL_70:.*]] = fir.load %[[VAL_34]] : !fir.ref<i32>
+// CHECK:                 %[[VAL_71:.*]] = fir.convert %[[VAL_70]] : (i32) -> i64
+// CHECK:                 %[[VAL_72:.*]] = fir.box_addr %[[VAL_65]] : (!fir.box<!fir.heap<!fir.array<?x?x?xf32>>>) -> !fir.heap<!fir.array<?x?x?xf32>>
+// CHECK:                 %[[VAL_73:.*]]:3 = fir.box_dims %[[VAL_65]], %[[VAL_4]] : (!fir.box<!fir.heap<!fir.array<?x?x?xf32>>>, index) -> (index, index, index)
+// CHECK:                 %[[VAL_74:.*]]:3 = fir.box_dims %[[VAL_65]], %[[VAL_5]] : (!fir.box<!fir.heap<!fir.array<?x?x?xf32>>>, index) -> (index, index, index)
+// CHECK:                 %[[VAL_75:.*]]:3 = fir.box_dims %[[VAL_65]], %[[VAL_3]] : (!fir.box<!fir.heap<!fir.array<?x?x?xf32>>>, index) -> (index, index, index)
+// CHECK:                 %[[VAL_76:.*]] = fir.shape_shift %[[VAL_73]]#0, %[[VAL_73]]#1, %[[VAL_74]]#0, %[[VAL_74]]#1, %[[VAL_75]]#0, %[[VAL_75]]#1 : (index, index, index, index, index, index) -> !fir.shapeshift<3>
+// CHECK:                 %[[VAL_77:.*]] = fir.array_coor %[[VAL_72]](%[[VAL_76]]) %[[VAL_67]], %[[VAL_69]], %[[VAL_71]] : (!fir.heap<!fir.array<?x?x?xf32>>, !fir.shapeshift<3>, i64, i64, i64) -> !fir.ref<f32>
+// CHECK:                 %[[VAL_78:.*]] = fir.load %[[VAL_77]] {tbaa = [#[[ARG_LOW_TAG]]]} : !fir.ref<f32>
+// CHECK:                 fir.store %[[VAL_78]] to %[[VAL_26]] : !fir.ref<f32>
+// load from box tagged in CodeGen
+// CHECK:                 %[[VAL_79:.*]] = fir.load %[[VAL_8]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
+// TODO: local allocation assumed to always alias
+// CHECK:                 %[[VAL_80:.*]] = fir.load %[[VAL_32]] : !fir.ref<i32>
+// CHECK:                 %[[VAL_81:.*]] = fir.convert %[[VAL_80]] : (i32) -> i64
+// CHECK:                 %[[VAL_82:.*]] = fir.box_addr %[[VAL_79]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>) -> !fir.heap<!fir.array<?xf32>>
+// CHECK:                 %[[VAL_83:.*]]:3 = fir.box_dims %[[VAL_79]], %[[VAL_4]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> (index, index, index)
+// CHECK:                 %[[VAL_84:.*]] = fir.shape_shift %[[VAL_83]]#0, %[[VAL_83]]#1 : (index, index) -> !fir.shapeshift<1>
+// CHECK:                 %[[VAL_85:.*]] = fir.array_coor %[[VAL_82]](%[[VAL_84]]) %[[VAL_81]] : (!fir.heap<!fir.array<?xf32>>, !fir.shapeshift<1>, i64) -> !fir.ref<f32>
+// load from global variable
+// CHECK:                 %[[VAL_86:.*]] = fir.load %[[VAL_85]] : !fir.ref<f32>
+// load from box
+// CHECK:                 %[[VAL_87:.*]] = fir.load %[[VAL_35]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?x?xf32>>>>
+// load from local allocation
+// CHECK:                 %[[VAL_88:.*]] = fir.load %[[VAL_30]] : !fir.ref<i32>
+// CHECK:                 %[[VAL_89:.*]] = fir.convert %[[VAL_88]] : (i32) -> i64
+// load from local allocation
+// CHECK:                 %[[VAL_90:.*]] = fir.load %[[VAL_34]] : !fir.ref<i32>
+// CHECK:                 %[[VAL_91:.*]] = fir.convert %[[VAL_90]] : (i32) -> i64
+// CHECK:                 %[[VAL_92:.*]] = fir.box_addr %[[VAL_87]] : (!fir.box<!fir.heap<!fir.array<?x?x?xf32>>>) -> !fir.heap<!fir.array<?x?x?xf32>>
+// CHECK:                 %[[VAL_93:.*]]:3 = fir.box_dims %[[VAL_87]], %[[VAL_4]] : (!fir.box<!fir.heap<!fir.array<?x?x?xf32>>>, index) -> (index, index, index)
+// CHECK:                 %[[VAL_94:.*]]:3 = fir.box_dims %[[VAL_87]], %[[VAL_5]] : (!fir.box<!fir.heap<!fir.array<?x?x?xf32>>>, index) -> (index, index, index)
+// CHECK:                 %[[VAL_95:.*]]:3 = fir.box_dims %[[VAL_87]], %[[VAL_3]] : (!fir.box<!fir.heap<!fir.array<?x?x?xf32>>>, index) -> (index, index, index)
+// CHECK:                 %[[VAL_96:.*]] = fir.shape_shift %[[VAL_93]]#0, %[[VAL_93]]#1, %[[VAL_94]]#0, %[[VAL_94]]#1, %[[VAL_95]]#0, %[[VAL_95]]#1 : (index, index, index, index, index, index) -> !fir.shapeshift<3>
+// CHECK:                 %[[VAL_97:.*]] = fir.array_coor %[[VAL_92]](%[[VAL_96]]) %[[VAL_89]], %[[VAL_81]], %[[VAL_91]] : (!fir.heap<!fir.array<?x?x?xf32>>, !fir.shapeshift<3>, i64, i64, i64) -> !fir.ref<f32>
+// CHECK:                 %[[VAL_98:.*]] = fir.load %[[VAL_97]] {tbaa = [#[[ARG_LOW_TAG]]]} : !fir.ref<f32>
+// CHECK:                 %[[VAL_99:.*]] = arith.mulf %[[VAL_86]], %[[VAL_98]] fastmath<contract> : f32
+// load from box
+// CHECK:                 %[[VAL_100:.*]] = fir.load %[[VAL_10]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
+// CHECK:                 %[[VAL_101:.*]] = fir.box_addr %[[VAL_100]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>) -> !fir.heap<!fir.array<?xf32>>
+// CHECK:                 %[[VAL_102:.*]]:3 = fir.box_dims %[[VAL_100]], %[[VAL_4]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>, index) -> (index, index, index)
+// CHECK:                 %[[VAL_103:.*]] = fir.shape_shift %[[VAL_102]]#0, %[[VAL_102]]#1 : (index, index) -> !fir.shapeshift<1>
+// CHECK:                 %[[VAL_104:.*]] = fir.array_coor %[[VAL_101]](%[[VAL_103]]) %[[VAL_81]] : (!fir.heap<!fir.array<?xf32>>, !fir.shapeshift<1>, i64) -> !fir.ref<f32>
+// load from global variable
+// CHECK:                 %[[VAL_105:.*]] = fir.load %[[VAL_104]] : !fir.ref<f32>
+// CHECK:                 %[[VAL_106:.*]] = fir.array_coor %[[VAL_39]] %[[VAL_89]], %[[VAL_81]], %[[VAL_91]] : (!fir.box<!fir.array<?x?x?xf32>>, i64, i64, i64) -> !fir.ref<f32>
+// CHECK:                 %[[VAL_107:.*]] = fir.load %[[VAL_106]] {tbaa = [#[[ARG_Z_TAG]]]} : !fir.ref<f32>
+// CHECK:                 %[[VAL_108:.*]] = arith.subi %[[VAL_80]], %[[VAL_6]] : i32
+// CHECK:                 %[[VAL_109:.*]] = fir.convert %[[VAL_108]] : (i32) -> i64
+// CHECK:                 %[[VAL_110:.*]] = fir.array_coor %[[VAL_39]] %[[VAL_89]], %[[VAL_109]], %[[VAL_91]] : (!fir.box<!fir.array<?x?x?xf32>>, i64, i64, i64) -> !fir.ref<f32>
+// CHECK:                 %[[VAL_111:.*]] = fir.load %[[VAL_110]] {tbaa = [#[[ARG_Z_TAG]]]} : !fir.ref<f32>
+// CHECK:                 %[[VAL_112:.*]] = arith.subf %[[VAL_107]], %[[VAL_111]] fastmath<contract> : f32
+// CHECK:                 %[[VAL_113:.*]] = fir.no_reassoc %[[VAL_112]] : f32
+// load from global variable
+// CHECK:                 %[[VAL_114:.*]] = fir.load %[[VAL_14]] : !fir.ref<f32>
+// CHECK:                 %[[VAL_115:.*]] = arith.mulf %[[VAL_113]], %[[VAL_114]] fastmath<contract> : f32
+// CHECK:                 %[[VAL_116:.*]] = arith.subi %[[VAL_90]], %[[VAL_6]] : i32
+// CHECK:                 %[[VAL_117:.*]] = fir.convert %[[VAL_116]] : (i32) -> i64
+// CHECK:                 %[[VAL_118:.*]] = fir.array_coor %[[VAL_37]] %[[VAL_89]], %[[VAL_81]], %[[VAL_117]] : (!fir.box<!fir.array<?x?x?xf32>>, i64, i64, i64) -> !fir.ref<f32>
+// CHECK:                 %[[VAL_119:.*]] = fir.load %[[VAL_118]] {tbaa = [#[[ARG_Y_TAG]]]} : !fir.ref<f32>
+// CHECK:                 %[[VAL_120:.*]] = fir.array_coor %[[VAL_37]] %[[VAL_89]], %[[VAL_81]], %[[VAL_91]] : (!fir.box<!fir.array<?x?x?xf32>>, i64, i64, i64) -> !fir.ref<f32>
+// CHECK:                 %[[VAL_121:.*]] = fir.load %[[VAL_120]] {tbaa = [#[[ARG_Y_TAG]]]} : !fir.ref<f32>
+// CHECK:                 %[[VAL_122:.*]] = arith.subf %[[VAL_119]], %[[VAL_121]] fastmath<contract> : f32
+// CHECK:                 %[[VAL_123:.*]] = fir.no_reassoc %[[VAL_122]] : f32
+// load from local allocation
+// CHECK:                 %[[VAL_124:.*]] = fir.load %[[VAL_28]] : !fir.ref<f32>
+// CHECK:                 %[[VAL_125:.*]] = arith.mulf %[[VAL_123]], %[[VAL_124]] fastmath<contract> : f32
+// CHECK:                 %[[VAL_126:.*]] = arith.addf %[[VAL_115]], %[[VAL_125]] fastmath<contract> : f32
+// CHECK:                 %[[VAL_127:.*]] = fir.no_reassoc %[[VAL_126]] : f32
+// CHECK:                 %[[VAL_128:.*]] = arith.mulf %[[VAL_105]], %[[VAL_127]] fastmath<contract> : f32
+// CHECK:                 %[[VAL_129:.*]] = arith.addf %[[VAL_99]], %[[VAL_128]] fastmath<contract> : f32
+// CHECK:                 fir.store %[[VAL_129]] to %[[VAL_97]] {tbaa = [#[[ARG_LOW_TAG]]]} : !fir.ref<f32>
+// CHECK:                 %[[VAL_130:.*]] = arith.addi %[[VAL_63]], %[[VAL_5]] : index
+// CHECK:                 %[[VAL_131:.*]] = fir.convert %[[VAL_5]] : (index) -> i32
+// load from local allocation
+// CHECK:                 %[[VAL_132:.*]] = fir.load %[[VAL_30]] : !fir.ref<i32>
+// CHECK:                 %[[VAL_133:.*]] = arith.addi %[[VAL_132]], %[[VAL_131]] : i32
+// CHECK:                 fir.result %[[VAL_130]], %[[VAL_133]] : index, i32
+// CHECK:               }
+// store to local allocation
+// CHECK:               fir.store %[[VAL_134:.*]]#1 to %[[VAL_30]] : !fir.ref<i32>
+// CHECK:               %[[VAL_135:.*]] = arith.addi %[[VAL_56]], %[[VAL_5]] : index
+// CHECK:               %[[VAL_136:.*]] = fir.convert %[[VAL_5]] : (index) -> i32
+// local allocation:
+// CHECK:               %[[VAL_137:.*]] = fir.load %[[VAL_32]] : !fir.ref<i32>
+// CHECK:               %[[VAL_138:.*]] = arith.addi %[[VAL_137]], %[[VAL_136]] : i32
+// CHECK:               fir.result %[[VAL_135]], %[[VAL_138]] : index, i32
+// CHECK:             }
+// local allocation:
+// CHECK:             fir.store %[[VAL_139:.*]]#1 to %[[VAL_32]] : !fir.ref<i32>
+// CHECK:             %[[VAL_140:.*]] = arith.addi %[[VAL_47]], %[[VAL_5]] : index
+// CHECK:             %[[VAL_141:.*]] = fir.convert %[[VAL_5]] : (index) -> i32
+// local allocation:
+// CHECK:             %[[VAL_142:.*]] = fir.load %[[VAL_34]] : !fir.ref<i32>
+// CHECK:             %[[VAL_143:.*]] = arith.addi %[[VAL_142]], %[[VAL_141]] : i32
+// CHECK:             fir.result %[[VAL_140]], %[[VAL_143]] : index, i32
+// CHECK:           }
+// local allocation:
+// CHECK:           fir.store %[[VAL_144:.*]]#1 to %[[VAL_34]] : !fir.ref<i32>
+// CHECK:           return
+// CHECK:         }

>From 612ac44a65c9c10014a141a5a7d751384af081b1 Mon Sep 17 00:00:00 2001
From: Anatoly Trosinenko <atrosinenko at accesssoftek.com>
Date: Wed, 11 Oct 2023 13:52:41 +0300
Subject: [PATCH 04/38] [AArch64][PAC] Check authenticated LR value during tail
 call

When performing a tail call, check the value of the LR register after
authentication to prevent the callee from signing and spilling an
untrusted value. This commit implements a few variants of the check;
more can be added later.

If it is safe to assume that executable pages are always readable,
LR can be checked just by dereferencing the LR value via LDR.

As an alternative, LR can be checked as follows:

    ; lowered AUT* instruction
    ; <some variant of check that LR contains a valid address>
    b.cond break_block
  ret_block:
    ; lowered TCRETURN
  break_block:
    brk 0xc471

As the existing methods either break compatibility with execute-only
memory mappings or can degrade performance, they are disabled by
default and can be explicitly enabled with a command-line option.

Individual subtargets can opt in to one of the available methods
by updating AArch64Subtarget::getAuthenticatedLRCheckMethod().
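
For illustration, a subtarget that knows its executable pages are always
readable could opt in to the dummy-load check roughly as follows (a hedged
sketch against the interfaces added in this patch; defaulting to DummyLoad
here is a hypothetical choice, not what this patch does):

    // Sketch only: this patch keeps AuthCheckMethod::None as the default
    // and only honours the command-line override below.
    AArch64PAuth::AuthCheckMethod
    AArch64Subtarget::getAuthenticatedLRCheckMethod() const {
      // Respect -aarch64-authenticated-lr-check-method=<...> if it was given.
      if (AuthenticatedLRCheckMethod.getNumOccurrences())
        return AuthenticatedLRCheckMethod;
      // Executable pages are readable on this (hypothetical) core, so a
      // dummy load of [LR] is enough to fault on a failed authentication.
      return AArch64PAuth::AuthCheckMethod::DummyLoad;
    }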

Reviewed By: kristof.beyls

Differential Revision: https://reviews.llvm.org/D156716
---
 .../Target/AArch64/AArch64FrameLowering.cpp   |  30 +--
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp  |  32 +++-
 llvm/lib/Target/AArch64/AArch64InstrInfo.h    |   3 +
 .../AArch64/AArch64MachineFunctionInfo.cpp    |  22 ++-
 .../AArch64/AArch64MachineFunctionInfo.h      |   2 +
 .../lib/Target/AArch64/AArch64PointerAuth.cpp | 180 +++++++++++++++++-
 llvm/lib/Target/AArch64/AArch64PointerAuth.h  | 116 +++++++++++
 llvm/lib/Target/AArch64/AArch64Subtarget.cpp  |  32 ++++
 llvm/lib/Target/AArch64/AArch64Subtarget.h    |  27 +++
 .../AArch64/sign-return-address-tailcall.ll   | 121 ++++++++++++
 10 files changed, 534 insertions(+), 31 deletions(-)
 create mode 100644 llvm/lib/Target/AArch64/AArch64PointerAuth.h
 create mode 100644 llvm/test/CodeGen/AArch64/sign-return-address-tailcall.ll

diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 78f85faaf69bf96..e68d67c6e78de2c 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -269,14 +269,10 @@ STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
 static int64_t getArgumentStackToRestore(MachineFunction &MF,
                                          MachineBasicBlock &MBB) {
   MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
-  bool IsTailCallReturn = false;
-  if (MBB.end() != MBBI) {
-    unsigned RetOpcode = MBBI->getOpcode();
-    IsTailCallReturn = RetOpcode == AArch64::TCRETURNdi ||
-                       RetOpcode == AArch64::TCRETURNri ||
-                       RetOpcode == AArch64::TCRETURNriBTI;
-  }
   AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+  bool IsTailCallReturn = (MBB.end() != MBBI)
+                              ? AArch64InstrInfo::isTailCallReturnInst(*MBBI)
+                              : false;
 
   int64_t ArgumentPopSize = 0;
   if (IsTailCallReturn) {
@@ -300,7 +296,6 @@ static int64_t getArgumentStackToRestore(MachineFunction &MF,
 static bool produceCompactUnwindFrame(MachineFunction &MF);
 static bool needsWinCFI(const MachineFunction &MF);
 static StackOffset getSVEStackSize(const MachineFunction &MF);
-static bool needsShadowCallStackPrologueEpilogue(MachineFunction &MF);
 
 /// Returns true if a homogeneous prolog or epilog code can be emitted
 /// for the size optimization. If possible, a frame helper call is injected.
@@ -617,7 +612,7 @@ void AArch64FrameLowering::resetCFIToInitialState(
   }
 
   // Shadow call stack uses X18, reset it.
-  if (needsShadowCallStackPrologueEpilogue(MF))
+  if (MFI.needsShadowCallStackPrologueEpilogue(MF))
     insertCFISameValue(CFIDesc, MF, MBB, InsertPt,
                        TRI.getDwarfRegNum(AArch64::X18, true));
 
@@ -1290,19 +1285,6 @@ static bool IsSVECalleeSave(MachineBasicBlock::iterator I) {
   }
 }
 
-static bool needsShadowCallStackPrologueEpilogue(MachineFunction &MF) {
-  if (!(llvm::any_of(
-            MF.getFrameInfo().getCalleeSavedInfo(),
-            [](const auto &Info) { return Info.getReg() == AArch64::LR; }) &&
-        MF.getFunction().hasFnAttribute(Attribute::ShadowCallStack)))
-    return false;
-
-  if (!MF.getSubtarget<AArch64Subtarget>().isXRegisterReserved(18))
-    report_fatal_error("Must reserve x18 to use shadow call stack");
-
-  return true;
-}
-
 static void emitShadowCallStackPrologue(const TargetInstrInfo &TII,
                                         MachineFunction &MF,
                                         MachineBasicBlock &MBB,
@@ -1414,7 +1396,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
   DebugLoc DL;
 
   const auto &MFnI = *MF.getInfo<AArch64FunctionInfo>();
-  if (needsShadowCallStackPrologueEpilogue(MF))
+  if (MFnI.needsShadowCallStackPrologueEpilogue(MF))
     emitShadowCallStackPrologue(*TII, MF, MBB, MBBI, DL, NeedsWinCFI,
                                 MFnI.needsDwarfUnwindInfo(MF));
 
@@ -1945,7 +1927,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
       if (NeedsWinCFI)
         HasWinCFI = true; // AArch64PointerAuth pass will insert SEH_PACSignLR
     }
-    if (needsShadowCallStackPrologueEpilogue(MF))
+    if (AFI->needsShadowCallStackPrologueEpilogue(MF))
       emitShadowCallStackEpilogue(*TII, MF, MBB, MBB.getFirstTerminator(), DL);
     if (EmitCFI)
       emitCalleeSavedGPRRestores(MBB, MBB.getFirstTerminator());
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 5a234bceb25ed0a..c804312da5369ef 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -14,6 +14,7 @@
 #include "AArch64InstrInfo.h"
 #include "AArch64FrameLowering.h"
 #include "AArch64MachineFunctionInfo.h"
+#include "AArch64PointerAuth.h"
 #include "AArch64Subtarget.h"
 #include "MCTargetDesc/AArch64AddressingModes.h"
 #include "Utils/AArch64BaseInfo.h"
@@ -2490,6 +2491,20 @@ bool AArch64InstrInfo::isPairableLdStInst(const MachineInstr &MI) {
   }
 }
 
+bool AArch64InstrInfo::isTailCallReturnInst(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  default:
+    assert((!MI.isCall() || !MI.isReturn()) &&
+           "Unexpected instruction - was a new tail call opcode introduced?");
+    return false;
+  case AArch64::TCRETURNdi:
+  case AArch64::TCRETURNri:
+  case AArch64::TCRETURNriBTI:
+  case AArch64::TCRETURNriALL:
+    return true;
+  }
+}
+
 unsigned AArch64InstrInfo::convertToFlagSettingOpc(unsigned Opc) {
   switch (Opc) {
   default:
@@ -8217,12 +8232,24 @@ AArch64InstrInfo::getOutliningCandidateInfo(
   // necessary. However, at this point we don't know if the outlined function
   // will have a RET instruction so we assume the worst.
   const TargetRegisterInfo &TRI = getRegisterInfo();
+  // Performing a tail call may require extra checks when PAuth is enabled.
+  // If PAuth is disabled, set it to zero for uniformity.
+  unsigned NumBytesToCheckLRInTCEpilogue = 0;
   if (FirstCand.getMF()
           ->getInfo<AArch64FunctionInfo>()
           ->shouldSignReturnAddress(true)) {
     // One PAC and one AUT instructions
     NumBytesToCreateFrame += 8;
 
+    // PAuth is enabled - set extra tail call cost, if any.
+    auto LRCheckMethod = Subtarget.getAuthenticatedLRCheckMethod();
+    NumBytesToCheckLRInTCEpilogue =
+        AArch64PAuth::getCheckerSizeInBytes(LRCheckMethod);
+    // Checking the authenticated LR value may significantly impact
+    // SequenceSize, so account for it for more precise results.
+    if (isTailCallReturnInst(*RepeatedSequenceLocs[0].back()))
+      SequenceSize += NumBytesToCheckLRInTCEpilogue;
+
     // We have to check if sp modifying instructions would get outlined.
     // If so we only allow outlining if sp is unchanged overall, so matching
     // sub and add instructions are okay to outline, all other sp modifications
@@ -8393,7 +8420,8 @@ AArch64InstrInfo::getOutliningCandidateInfo(
   if (RepeatedSequenceLocs[0].back()->isTerminator()) {
     FrameID = MachineOutlinerTailCall;
     NumBytesToCreateFrame = 0;
-    SetCandidateCallInfo(MachineOutlinerTailCall, 4);
+    unsigned NumBytesForCall = 4 + NumBytesToCheckLRInTCEpilogue;
+    SetCandidateCallInfo(MachineOutlinerTailCall, NumBytesForCall);
   }
 
   else if (LastInstrOpcode == AArch64::BL ||
@@ -8402,7 +8430,7 @@ AArch64InstrInfo::getOutliningCandidateInfo(
             !HasBTI)) {
     // FIXME: Do we need to check if the code after this uses the value of LR?
     FrameID = MachineOutlinerThunk;
-    NumBytesToCreateFrame = 0;
+    NumBytesToCreateFrame = NumBytesToCheckLRInTCEpilogue;
     SetCandidateCallInfo(MachineOutlinerThunk, 4);
   }
 
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index f5874d7856f8d24..4a40b2fa122159f 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -129,6 +129,9 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo {
   /// Return true if pairing the given load or store may be paired with another.
   static bool isPairableLdStInst(const MachineInstr &MI);
 
+  /// Returns true if MI is one of the TCRETURN* instructions.
+  static bool isTailCallReturnInst(const MachineInstr &MI);
+
   /// Return the opcode that set flags when possible.  The caller is
   /// responsible for ensuring the opc has a flag setting equivalent.
   static unsigned convertToFlagSettingOpc(unsigned Opc);
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
index 961a19317d6660b..7bb5041b8ba9481 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
@@ -122,11 +122,27 @@ bool AArch64FunctionInfo::shouldSignReturnAddress(bool SpillsLR) const {
   return SpillsLR;
 }
 
+static bool isLRSpilled(const MachineFunction &MF) {
+  return llvm::any_of(
+      MF.getFrameInfo().getCalleeSavedInfo(),
+      [](const auto &Info) { return Info.getReg() == AArch64::LR; });
+}
+
 bool AArch64FunctionInfo::shouldSignReturnAddress(
     const MachineFunction &MF) const {
-  return shouldSignReturnAddress(llvm::any_of(
-      MF.getFrameInfo().getCalleeSavedInfo(),
-      [](const auto &Info) { return Info.getReg() == AArch64::LR; }));
+  return shouldSignReturnAddress(isLRSpilled(MF));
+}
+
+bool AArch64FunctionInfo::needsShadowCallStackPrologueEpilogue(
+    MachineFunction &MF) const {
+  if (!(isLRSpilled(MF) &&
+        MF.getFunction().hasFnAttribute(Attribute::ShadowCallStack)))
+    return false;
+
+  if (!MF.getSubtarget<AArch64Subtarget>().isXRegisterReserved(18))
+    report_fatal_error("Must reserve x18 to use shadow call stack");
+
+  return true;
 }
 
 bool AArch64FunctionInfo::needsDwarfUnwindInfo(
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 8df95ff1e6eaea2..0b8bfb04a572c77 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -431,6 +431,8 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
   bool shouldSignReturnAddress(const MachineFunction &MF) const;
   bool shouldSignReturnAddress(bool SpillsLR) const;
 
+  bool needsShadowCallStackPrologueEpilogue(MachineFunction &MF) const;
+
   bool shouldSignWithBKey() const { return SignWithBKey; }
   bool isMTETagged() const { return IsMTETagged; }
 
diff --git a/llvm/lib/Target/AArch64/AArch64PointerAuth.cpp b/llvm/lib/Target/AArch64/AArch64PointerAuth.cpp
index f8f11b43e35be51..f9b3027c35bb3dd 100644
--- a/llvm/lib/Target/AArch64/AArch64PointerAuth.cpp
+++ b/llvm/lib/Target/AArch64/AArch64PointerAuth.cpp
@@ -6,7 +6,10 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "AArch64PointerAuth.h"
+
 #include "AArch64.h"
+#include "AArch64InstrInfo.h"
 #include "AArch64MachineFunctionInfo.h"
 #include "AArch64Subtarget.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
@@ -14,6 +17,7 @@
 #include "llvm/CodeGen/MachineModuleInfo.h"
 
 using namespace llvm;
+using namespace llvm::AArch64PAuth;
 
 #define AARCH64_POINTER_AUTH_NAME "AArch64 Pointer Authentication"
 
@@ -30,13 +34,19 @@ class AArch64PointerAuth : public MachineFunctionPass {
   StringRef getPassName() const override { return AARCH64_POINTER_AUTH_NAME; }
 
 private:
+  /// An immediate operand passed to the BRK instruction, if it is ever emitted.
+  const unsigned BrkOperand = 0xc471;
+
   const AArch64Subtarget *Subtarget = nullptr;
   const AArch64InstrInfo *TII = nullptr;
+  const AArch64RegisterInfo *TRI = nullptr;
 
   void signLR(MachineFunction &MF, MachineBasicBlock::iterator MBBI) const;
 
   void authenticateLR(MachineFunction &MF,
                       MachineBasicBlock::iterator MBBI) const;
+
+  bool checkAuthenticatedLR(MachineBasicBlock::iterator TI) const;
 };
 
 } // end anonymous namespace
@@ -132,22 +142,179 @@ void AArch64PointerAuth::authenticateLR(
   }
 }
 
+namespace {
+
+// Mark the dummy LDR instruction as volatile to prevent it from being removed as dead code.
+MachineMemOperand *createCheckMemOperand(MachineFunction &MF,
+                                         const AArch64Subtarget &Subtarget) {
+  MachinePointerInfo PointerInfo(Subtarget.getAddressCheckPSV());
+  auto MOVolatileLoad =
+      MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
+
+  return MF.getMachineMemOperand(PointerInfo, MOVolatileLoad, 4, Align(4));
+}
+
+} // namespace
+
+MachineBasicBlock &llvm::AArch64PAuth::checkAuthenticatedRegister(
+    MachineBasicBlock::iterator MBBI, AuthCheckMethod Method,
+    Register AuthenticatedReg, Register TmpReg, bool UseIKey, unsigned BrkImm) {
+
+  MachineBasicBlock &MBB = *MBBI->getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+  const AArch64InstrInfo *TII = Subtarget.getInstrInfo();
+  DebugLoc DL = MBBI->getDebugLoc();
+
+  // First, handle the methods not requiring creating extra MBBs.
+  switch (Method) {
+  default:
+    break;
+  case AuthCheckMethod::None:
+    return MBB;
+  case AuthCheckMethod::DummyLoad:
+    BuildMI(MBB, MBBI, DL, TII->get(AArch64::LDRWui), getWRegFromXReg(TmpReg))
+        .addReg(AArch64::LR)
+        .addImm(0)
+        .addMemOperand(createCheckMemOperand(MF, Subtarget));
+    return MBB;
+  }
+
+  // Control flow has to be changed, so arrange new MBBs.
+
+  // At this point, at least an AUT* instruction is expected before MBBI.
+  assert(MBBI != MBB.begin() &&
+         "Cannot insert the check at the very beginning of MBB");
+  // The block to insert check into.
+  MachineBasicBlock *CheckBlock = &MBB;
+  // The remaining part of the original MBB that is executed on success.
+  MachineBasicBlock *SuccessBlock = MBB.splitAt(*std::prev(MBBI));
+
+  // The block that explicitly generates a break-point exception on failure.
+  MachineBasicBlock *BreakBlock =
+      MF.CreateMachineBasicBlock(MBB.getBasicBlock());
+  MF.push_back(BreakBlock);
+  MBB.splitSuccessor(SuccessBlock, BreakBlock);
+
+  assert(CheckBlock->getFallThrough() == SuccessBlock);
+  BuildMI(BreakBlock, DL, TII->get(AArch64::BRK)).addImm(BrkImm);
+
+  switch (Method) {
+  case AuthCheckMethod::None:
+  case AuthCheckMethod::DummyLoad:
+    llvm_unreachable("Should be handled above");
+  case AuthCheckMethod::HighBitsNoTBI:
+    BuildMI(CheckBlock, DL, TII->get(AArch64::EORXrs), TmpReg)
+        .addReg(AuthenticatedReg)
+        .addReg(AuthenticatedReg)
+        .addImm(1);
+    BuildMI(CheckBlock, DL, TII->get(AArch64::TBNZX))
+        .addReg(TmpReg)
+        .addImm(62)
+        .addMBB(BreakBlock);
+    return *SuccessBlock;
+  case AuthCheckMethod::XPACHint:
+    assert(AuthenticatedReg == AArch64::LR &&
+           "XPACHint mode is only compatible with checking the LR register");
+    assert(UseIKey && "XPACHint mode is only compatible with I-keys");
+    BuildMI(CheckBlock, DL, TII->get(AArch64::ORRXrs), TmpReg)
+        .addReg(AArch64::XZR)
+        .addReg(AArch64::LR)
+        .addImm(0);
+    BuildMI(CheckBlock, DL, TII->get(AArch64::XPACLRI));
+    BuildMI(CheckBlock, DL, TII->get(AArch64::SUBSXrs), AArch64::XZR)
+        .addReg(TmpReg)
+        .addReg(AArch64::LR)
+        .addImm(0);
+    BuildMI(CheckBlock, DL, TII->get(AArch64::Bcc))
+        .addImm(AArch64CC::NE)
+        .addMBB(BreakBlock);
+    return *SuccessBlock;
+  }
+}
+
+unsigned llvm::AArch64PAuth::getCheckerSizeInBytes(AuthCheckMethod Method) {
+  switch (Method) {
+  case AuthCheckMethod::None:
+    return 0;
+  case AuthCheckMethod::DummyLoad:
+    return 4;
+  case AuthCheckMethod::HighBitsNoTBI:
+    return 12;
+  case AuthCheckMethod::XPACHint:
+    return 20;
+  }
+}
+
+bool AArch64PointerAuth::checkAuthenticatedLR(
+    MachineBasicBlock::iterator TI) const {
+  AuthCheckMethod Method = Subtarget->getAuthenticatedLRCheckMethod();
+
+  if (Method == AuthCheckMethod::None)
+    return false;
+
+  // FIXME If FEAT_FPAC is implemented by the CPU, this check can be skipped.
+
+  assert(!TI->getMF()->hasWinCFI() && "WinCFI is not yet supported");
+
+  // The following code may create a signing oracle:
+  //
+  //   <authenticate LR>
+  //   TCRETURN          ; the callee may sign and spill the LR in its prologue
+  //
+  // To avoid generating a signing oracle, check the authenticated value
+  // before possibly re-signing it in the callee, as follows:
+  //
+  //   <authenticate LR>
+  //   <check if LR contains a valid address>
+  //   b.<cond> break_block
+  // ret_block:
+  //   TCRETURN
+  // break_block:
+  //   brk <BrkOperand>
+  //
+  // or just
+  //
+  //   <authenticate LR>
+  //   ldr tmp, [lr]
+  //   TCRETURN
+
+  // TmpReg is chosen assuming X16 and X17 are dead after TI.
+  assert(AArch64InstrInfo::isTailCallReturnInst(*TI) &&
+         "Tail call is expected");
+  Register TmpReg =
+      TI->readsRegister(AArch64::X16, TRI) ? AArch64::X17 : AArch64::X16;
+  assert(!TI->readsRegister(TmpReg, TRI) &&
+         "More than a single register is used by TCRETURN");
+
+  checkAuthenticatedRegister(TI, Method, AArch64::LR, TmpReg, /*UseIKey=*/true,
+                             BrkOperand);
+
+  return true;
+}
+
 bool AArch64PointerAuth::runOnMachineFunction(MachineFunction &MF) {
-  if (!MF.getInfo<AArch64FunctionInfo>()->shouldSignReturnAddress(true))
+  const auto *MFnI = MF.getInfo<AArch64FunctionInfo>();
+  if (!MFnI->shouldSignReturnAddress(true))
     return false;
 
   Subtarget = &MF.getSubtarget<AArch64Subtarget>();
   TII = Subtarget->getInstrInfo();
+  TRI = Subtarget->getRegisterInfo();
 
   SmallVector<MachineBasicBlock::iterator> DeletedInstrs;
+  SmallVector<MachineBasicBlock::iterator> TailCallInstrs;
+
   bool Modified = false;
+  bool HasAuthenticationInstrs = false;
 
   for (auto &MBB : MF) {
     for (auto &MI : MBB) {
       auto It = MI.getIterator();
       switch (MI.getOpcode()) {
       default:
-        // do nothing
+        if (AArch64InstrInfo::isTailCallReturnInst(MI))
+          TailCallInstrs.push_back(It);
         break;
       case AArch64::PAUTH_PROLOGUE:
         signLR(MF, It);
@@ -158,11 +325,20 @@ bool AArch64PointerAuth::runOnMachineFunction(MachineFunction &MF) {
         authenticateLR(MF, It);
         DeletedInstrs.push_back(It);
         Modified = true;
+        HasAuthenticationInstrs = true;
         break;
       }
     }
   }
 
+  // FIXME Do we need to emit any PAuth-related epilogue code at all
+  //       when SCS is enabled?
+  if (HasAuthenticationInstrs &&
+      !MFnI->needsShadowCallStackPrologueEpilogue(MF)) {
+    for (auto TailCall : TailCallInstrs)
+      Modified |= checkAuthenticatedLR(TailCall);
+  }
+
   for (auto MI : DeletedInstrs)
     MI->eraseFromParent();
 
diff --git a/llvm/lib/Target/AArch64/AArch64PointerAuth.h b/llvm/lib/Target/AArch64/AArch64PointerAuth.h
new file mode 100644
index 000000000000000..e1ceaed58abe47c
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64PointerAuth.h
@@ -0,0 +1,116 @@
+//===-- AArch64PointerAuth.h -- Harden code using PAuth ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64POINTERAUTH_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64POINTERAUTH_H
+
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/Register.h"
+
+namespace llvm {
+namespace AArch64PAuth {
+
+/// Variants of check performed on an authenticated pointer.
+///
+/// In cases such as authenticating the LR value when performing a tail call
+/// or when re-signing a signed pointer with a different signing schema,
+/// a failed authentication may not generate an exception on its own and may
+/// create an authentication or signing oracle if not checked explicitly.
+///
+/// A number of check methods modify control flow in a similar way by
+/// rewriting the code
+///
+/// ```
+///   <authenticate LR>
+///   <more instructions>
+/// ```
+///
+/// as follows:
+///
+/// ```
+///   <authenticate LR>
+///   <method-specific checker>
+/// ret_block:
+///   <more instructions>
+///   ...
+///
+/// break_block:
+///   brk <code>
+/// ```
+enum class AuthCheckMethod {
+  /// Do not check the value at all
+  None,
+  /// Perform a load to a temporary register
+  DummyLoad,
+  /// Check by comparing bits 62 and 61 of the authenticated address.
+  ///
+  /// This method modifies control flow and inserts the following checker:
+  ///
+  /// ```
+  ///   eor Xtmp, Xn, Xn, lsl #1
+  ///   tbnz Xtmp, #62, break_block
+  /// ```
+  HighBitsNoTBI,
+  /// Check by comparing the authenticated value with an XPAC-ed one without
+  /// using PAuth instructions not encoded as HINT. Can only be applied to LR.
+  ///
+  /// This method modifies control flow and inserts the following checker:
+  ///
+  /// ```
+  ///   mov Xtmp, LR
+  ///   xpaclri           ; encoded as "hint #7"
+  ///   ; Note: at this point, the LR register contains the address as if
+  ///   ; the authentication succeeded and the temporary register contains the
+  ///   ; *real* result of authentication.
+  ///   cmp Xtmp, LR
+  ///   b.ne break_block
+  /// ```
+  XPACHint,
+};
+
+#define AUTH_CHECK_METHOD_CL_VALUES_COMMON                                     \
+      clEnumValN(AArch64PAuth::AuthCheckMethod::None, "none",                  \
+                 "Do not check authenticated address"),                        \
+      clEnumValN(AArch64PAuth::AuthCheckMethod::DummyLoad, "load",             \
+                 "Perform dummy load from authenticated address"),             \
+      clEnumValN(AArch64PAuth::AuthCheckMethod::HighBitsNoTBI,                 \
+                 "high-bits-notbi",                                            \
+                 "Compare bits 62 and 61 of address (TBI should be disabled)")
+
+#define AUTH_CHECK_METHOD_CL_VALUES_LR                                         \
+      AUTH_CHECK_METHOD_CL_VALUES_COMMON,                                      \
+      clEnumValN(AArch64PAuth::AuthCheckMethod::XPACHint, "xpac-hint",         \
+                 "Compare with the result of XPACLRI")
+
+/// Explicitly checks that pointer authentication succeeded.
+///
+/// Assuming AuthenticatedReg contains a value returned by one of the AUT*
+/// instructions, check the value using Method just before the instruction
+/// pointed to by MBBI. If the check succeeds, execution proceeds to the
+/// instruction pointed to by MBBI, otherwise a CPU exception is generated.
+///
+/// Some of the methods may need to know if the pointer was authenticated
+/// using an I-key or D-key and which register can be used as temporary.
+/// If an explicit BRK instruction is used to generate an exception, BrkImm
+/// specifies its immediate operand.
+///
+/// \returns The machine basic block containing the code that is executed
+///          after the check succeeds.
+MachineBasicBlock &checkAuthenticatedRegister(MachineBasicBlock::iterator MBBI,
+                                              AuthCheckMethod Method,
+                                              Register AuthenticatedReg,
+                                              Register TmpReg, bool UseIKey,
+                                              unsigned BrkImm);
+
+/// Returns the number of bytes added by checkAuthenticatedRegister.
+unsigned getCheckerSizeInBytes(AuthCheckMethod Method);
+
+} // end namespace AArch64PAuth
+} // end namespace llvm
+
+#endif
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 8946c0b71e2ba46..e3c3bff8e32984e 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -70,6 +70,13 @@ static cl::opt<bool> ForceStreamingCompatibleSVE(
         "Force the use of streaming-compatible SVE code for all functions"),
     cl::Hidden);
 
+static cl::opt<AArch64PAuth::AuthCheckMethod>
+    AuthenticatedLRCheckMethod("aarch64-authenticated-lr-check-method",
+                               cl::Hidden,
+                               cl::desc("Override the variant of check applied "
+                                        "to authenticated LR during tail call"),
+                               cl::values(AUTH_CHECK_METHOD_CL_VALUES_LR));
+
 unsigned AArch64Subtarget::getVectorInsertExtractBaseCost() const {
   if (OverrideVectorInsertExtractBaseCost.getNumOccurrences() > 0)
     return OverrideVectorInsertExtractBaseCost;
@@ -335,6 +342,8 @@ AArch64Subtarget::AArch64Subtarget(const Triple &TT, StringRef CPU,
   // X29 is named FP, so we can't use TRI->getName to check X29.
   if (ReservedRegNames.count("X29") || ReservedRegNames.count("FP"))
     ReserveXRegisterForRA.set(29);
+
+  AddressCheckPSV.reset(new AddressCheckPseudoSourceValue(TM));
 }
 
 const CallLowering *AArch64Subtarget::getCallLowering() const {
@@ -490,3 +499,26 @@ bool AArch64Subtarget::isSVEAvailable() const{
   // as we don't yet support the feature in LLVM.
   return hasSVE() && !isStreaming() && !isStreamingCompatible();
 }
+
+// If return address signing is enabled, tail calls are emitted as follows:
+//
+// ```
+//   <authenticate LR>
+//   <check LR>
+//   TCRETURN          ; the callee may sign and spill the LR in its prologue
+// ```
+//
+// LR may require explicit checking because if FEAT_FPAC is not implemented
+// and LR was tampered with, then `<authenticate LR>` will not generate an
+// exception on its own. Later, if the callee spills the signed LR value and
+// neither FEAT_PAuth2 nor FEAT_EPAC are implemented, the valid PAC replaces
+// the higher bits of LR thus hiding the authentication failure.
+AArch64PAuth::AuthCheckMethod
+AArch64Subtarget::getAuthenticatedLRCheckMethod() const {
+  if (AuthenticatedLRCheckMethod.getNumOccurrences())
+    return AuthenticatedLRCheckMethod;
+
+  // For now, use None by default because checks may introduce an unexpected
+  // performance regression or incompatibility with execute-only mappings.
+  return AArch64PAuth::AuthCheckMethod::None;
+}
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index 5d0fd7f9f45b59c..b91c5c81ed4d274 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -16,6 +16,7 @@
 #include "AArch64FrameLowering.h"
 #include "AArch64ISelLowering.h"
 #include "AArch64InstrInfo.h"
+#include "AArch64PointerAuth.h"
 #include "AArch64RegisterInfo.h"
 #include "AArch64SelectionDAGInfo.h"
 #include "llvm/CodeGen/GlobalISel/CallLowering.h"
@@ -432,6 +433,32 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
       return "__security_check_cookie_arm64ec";
     return "__security_check_cookie";
   }
+
+  /// Choose a method of checking LR before performing a tail call.
+  AArch64PAuth::AuthCheckMethod getAuthenticatedLRCheckMethod() const;
+
+  const PseudoSourceValue *getAddressCheckPSV() const {
+    return AddressCheckPSV.get();
+  }
+
+private:
+  /// Pseudo value representing memory load performed to check an address.
+  ///
+  /// This load operation is solely used for its side-effects: if the address
+  /// is not mapped (or not readable), it triggers a CPU exception; otherwise
+  /// execution proceeds and the value is not used.
+  class AddressCheckPseudoSourceValue : public PseudoSourceValue {
+  public:
+    AddressCheckPseudoSourceValue(const TargetMachine &TM)
+        : PseudoSourceValue(TargetCustom, TM) {}
+
+    bool isConstant(const MachineFrameInfo *) const override { return false; }
+    bool isAliased(const MachineFrameInfo *) const override { return true; }
+    bool mayAlias(const MachineFrameInfo *) const override { return true; }
+    void printCustom(raw_ostream &OS) const override { OS << "AddressCheck"; }
+  };
+
+  std::unique_ptr<AddressCheckPseudoSourceValue> AddressCheckPSV;
 };
 } // End llvm namespace
 
diff --git a/llvm/test/CodeGen/AArch64/sign-return-address-tailcall.ll b/llvm/test/CodeGen/AArch64/sign-return-address-tailcall.ll
new file mode 100644
index 000000000000000..ec04e553cac6e37
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sign-return-address-tailcall.ll
@@ -0,0 +1,121 @@
+; RUN: llc -mtriple=aarch64 -asm-verbose=0 < %s | FileCheck -DAUTIASP="hint #29" --check-prefixes=COMMON %s
+; RUN: llc -mtriple=aarch64 -asm-verbose=0 -aarch64-authenticated-lr-check-method=load                   < %s | FileCheck -DAUTIASP="hint #29" --check-prefixes=COMMON,LDR %s
+; RUN: llc -mtriple=aarch64 -asm-verbose=0 -aarch64-authenticated-lr-check-method=high-bits-notbi        < %s | FileCheck -DAUTIASP="hint #29" --check-prefixes=COMMON,BITS-NOTBI,BRK %s
+; RUN: llc -mtriple=aarch64 -asm-verbose=0 -aarch64-authenticated-lr-check-method=xpac-hint              < %s | FileCheck -DAUTIASP="hint #29" -DXPACLRI="hint #7" --check-prefixes=COMMON,XPAC,BRK %s
+; RUN: llc -mtriple=aarch64 -asm-verbose=0 -aarch64-authenticated-lr-check-method=xpac-hint -mattr=v8.3a < %s | FileCheck -DAUTIASP="autiasp"  -DXPACLRI="xpaclri" --check-prefixes=COMMON,XPAC,BRK %s
+
+define i32 @tailcall_direct() "sign-return-address"="non-leaf" {
+; COMMON-LABEL: tailcall_direct:
+; COMMON:         str x30, [sp, #-16]!
+; COMMON:         ldr x30, [sp], #16
+;
+; COMMON-NEXT:    [[AUTIASP]]
+;
+; LDR-NEXT:       ldr w16, [x30]
+;
+; BITS-NOTBI-NEXT: eor x16, x30, x30, lsl #1
+; BITS-NOTBI-NEXT: tbnz x16, #62, .[[FAIL:LBB[_0-9]+]]
+;
+; XPAC-NEXT:      mov x16, x30
+; XPAC-NEXT:      [[XPACLRI]]
+; XPAC-NEXT:      cmp x16, x30
+; XPAC-NEXT:      b.ne .[[FAIL:LBB[_0-9]+]]
+;
+; COMMON-NEXT:    b callee
+; BRK-NEXT:     .[[FAIL]]:
+; BRK-NEXT:       brk #0xc471
+  tail call void asm sideeffect "", "~{lr}"()
+  %call = tail call i32 @callee()
+  ret i32 %call
+}
+
+define i32 @tailcall_indirect(ptr %fptr) "sign-return-address"="non-leaf" {
+; COMMON-LABEL: tailcall_indirect:
+; COMMON:         str x30, [sp, #-16]!
+; COMMON:         ldr x30, [sp], #16
+;
+; COMMON-NEXT:    [[AUTIASP]]
+;
+; LDR-NEXT:       ldr w16, [x30]
+;
+; BITS-NOTBI-NEXT: eor x16, x30, x30, lsl #1
+; BITS-NOTBI-NEXT: tbnz x16, #62, .[[FAIL:LBB[_0-9]+]]
+;
+; XPAC-NEXT:      mov x16, x30
+; XPAC-NEXT:      [[XPACLRI]]
+; XPAC-NEXT:      cmp x16, x30
+; XPAC-NEXT:      b.ne .[[FAIL:LBB[_0-9]+]]
+;
+; COMMON-NEXT:    br x0
+; BRK-NEXT:     .[[FAIL]]:
+; BRK-NEXT:       brk #0xc471
+  tail call void asm sideeffect "", "~{lr}"()
+  %call = tail call i32 %fptr()
+  ret i32 %call
+}
+
+define i32 @tailcall_direct_noframe() "sign-return-address"="non-leaf" {
+; COMMON-LABEL: tailcall_direct_noframe:
+; COMMON-NEXT:    .cfi_startproc
+; COMMON-NEXT:    b callee
+  %call = tail call i32 @callee()
+  ret i32 %call
+}
+
+define i32 @tailcall_indirect_noframe(ptr %fptr) "sign-return-address"="non-leaf" {
+; COMMON-LABEL: tailcall_indirect_noframe:
+; COMMON-NEXT:    .cfi_startproc
+; COMMON-NEXT:    br x0
+  %call = tail call i32 %fptr()
+  ret i32 %call
+}
+
+define i32 @tailcall_direct_noframe_sign_all() "sign-return-address"="all" {
+; COMMON-LABEL: tailcall_direct_noframe_sign_all:
+; COMMON-NOT:     str{{.*}}x30
+; COMMON-NOT:     ldr{{.*}}x30
+;
+; COMMON:         [[AUTIASP]]
+;
+; LDR-NEXT:       ldr w16, [x30]
+;
+; BITS-NOTBI-NEXT: eor x16, x30, x30, lsl #1
+; BITS-NOTBI-NEXT: tbnz x16, #62, .[[FAIL:LBB[_0-9]+]]
+;
+; XPAC-NEXT:      mov x16, x30
+; XPAC-NEXT:      [[XPACLRI]]
+; XPAC-NEXT:      cmp x16, x30
+; XPAC-NEXT:      b.ne .[[FAIL:LBB[_0-9]+]]
+;
+; COMMON-NEXT:    b callee
+; BRK-NEXT:     .[[FAIL]]:
+; BRK-NEXT:       brk #0xc471
+  %call = tail call i32 @callee()
+  ret i32 %call
+}
+
+define i32 @tailcall_indirect_noframe_sign_all(ptr %fptr) "sign-return-address"="all" {
+; COMMON-LABEL: tailcall_indirect_noframe_sign_all:
+; COMMON-NOT:     str{{.*}}x30
+; COMMON-NOT:     ldr{{.*}}x30
+;
+; COMMON:         [[AUTIASP]]
+;
+; LDR-NEXT:       ldr w16, [x30]
+;
+; BITS-NOTBI-NEXT: eor x16, x30, x30, lsl #1
+; BITS-NOTBI-NEXT: tbnz x16, #62, .[[FAIL:LBB[_0-9]+]]
+;
+; XPAC-NEXT:      mov x16, x30
+; XPAC-NEXT:      [[XPACLRI]]
+; XPAC-NEXT:      cmp x16, x30
+; XPAC-NEXT:      b.ne .[[FAIL:LBB[_0-9]+]]
+;
+; COMMON-NEXT:    br x0
+; BRK-NEXT:     .[[FAIL]]:
+; BRK-NEXT:       brk #0xc471
+  %call = tail call i32 %fptr()
+  ret i32 %call
+}
+
+declare i32 @callee()

>From 33b82bd816b5c9ce29c6008ebf419b31c32f3b8e Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Wed, 11 Oct 2023 15:40:40 +0100
Subject: [PATCH 05/38] [SlotIndexes] Simplify SlotIndex() and isSameInstr().
 NFC.

Avoid using lie.getPointer() directly when there are helper methods we
can use.
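
For context, the helpers this switches to are thin wrappers over the same
PointerIntPair member, so the change is purely cosmetic (paraphrased sketch
of the existing SlotIndex accessors; exact bodies may differ):

    // Roughly what SlotIndex already provides (not verbatim):
    bool isValid() const { return lie.getPointer() != nullptr; }
    IndexListEntry *listEntry() const { return lie.getPointer(); }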
---
 llvm/include/llvm/CodeGen/SlotIndexes.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/SlotIndexes.h b/llvm/include/llvm/CodeGen/SlotIndexes.h
index 1a8d117322fcc6e..1517d33e65a3801 100644
--- a/llvm/include/llvm/CodeGen/SlotIndexes.h
+++ b/llvm/include/llvm/CodeGen/SlotIndexes.h
@@ -125,8 +125,7 @@ class raw_ostream;
 
     // Construct a new slot index from the given one, and set the slot.
     SlotIndex(const SlotIndex &li, Slot s) : lie(li.listEntry(), unsigned(s)) {
-      assert(lie.getPointer() != nullptr &&
-             "Attempt to construct index with 0 pointer.");
+      assert(isValid() && "Attempt to construct index with 0 pointer.");
     }
 
     /// Returns true if this is a valid index. Invalid indices do
@@ -178,7 +177,7 @@ class raw_ostream;
 
     /// isSameInstr - Return true if A and B refer to the same instruction.
     static bool isSameInstr(SlotIndex A, SlotIndex B) {
-      return A.lie.getPointer() == B.lie.getPointer();
+      return A.listEntry() == B.listEntry();
     }
 
     /// isEarlierInstr - Return true if A refers to an instruction earlier than

>From 30620af003261d89ee2376037bf6a646d6ef241f Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtrofin at google.com>
Date: Wed, 11 Oct 2023 08:02:15 -0700
Subject: [PATCH 06/38] [mlgo][nfc] Remove / fix vestigial references to
 Tensorflow

Some references in comments are unnecessarily specific, for historical reasons.
---
 llvm/include/llvm/Analysis/InlineAdvisor.h         |  5 ++---
 .../include/llvm/Analysis/InlineModelFeatureMaps.h |  4 ++--
 llvm/include/llvm/Analysis/MLModelRunner.h         |  4 +++-
 .../llvm/Analysis/ModelUnderTrainingRunner.h       |  5 +++--
 llvm/include/llvm/Analysis/TensorSpec.h            | 14 +++++++++-----
 llvm/include/llvm/Analysis/Utils/TFUtils.h         |  4 ++--
 llvm/include/llvm/Config/llvm-config.h.cmake       |  2 +-
 llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp |  2 +-
 llvm/lib/Analysis/TFLiteUtils.cpp                  |  4 ++--
 9 files changed, 25 insertions(+), 19 deletions(-)

diff --git a/llvm/include/llvm/Analysis/InlineAdvisor.h b/llvm/include/llvm/Analysis/InlineAdvisor.h
index 53c018d15cd71a6..2740106bc7db80f 100644
--- a/llvm/include/llvm/Analysis/InlineAdvisor.h
+++ b/llvm/include/llvm/Analysis/InlineAdvisor.h
@@ -36,9 +36,8 @@ struct ReplayInlinerSettings;
 ///
 /// - Development mode, for training new models.
 /// In this mode, we trade off runtime performance for flexibility. This mode
-/// requires the full C Tensorflow API library, and evaluates models
-/// dynamically. This mode also permits generating training logs, for offline
-/// training.
+/// requires the TFLite library, and evaluates models dynamically. This mode
+/// also permits generating training logs, for offline training.
 ///
 /// - Dynamically load an advisor via a plugin (PluginInlineAdvisorAnalysis)
 enum class InliningAdvisorMode : int { Default, Release, Development };
diff --git a/llvm/include/llvm/Analysis/InlineModelFeatureMaps.h b/llvm/include/llvm/Analysis/InlineModelFeatureMaps.h
index 77ae60059ce9ebd..06925e620bdd615 100644
--- a/llvm/include/llvm/Analysis/InlineModelFeatureMaps.h
+++ b/llvm/include/llvm/Analysis/InlineModelFeatureMaps.h
@@ -92,8 +92,8 @@ constexpr bool isHeuristicInlineCostFeature(InlineCostFeatureIndex Feature) {
 
 // List of features. Each feature is defined through a triple:
 // - the name of an enum member, which will be the feature index
-// - a textual name, used for Tensorflow model binding (so it needs to match the
-// names used by the Tensorflow model)
+// - a textual name, used for ML model binding (so it needs to match the
+// names used by the ML model).
 // - a documentation description. Currently, that is not used anywhere
 // programmatically, and serves as workaround to inability of inserting comments
 // in macros.
diff --git a/llvm/include/llvm/Analysis/MLModelRunner.h b/llvm/include/llvm/Analysis/MLModelRunner.h
index 903411fbdf7ecf1..21f155de85aecbb 100644
--- a/llvm/include/llvm/Analysis/MLModelRunner.h
+++ b/llvm/include/llvm/Analysis/MLModelRunner.h
@@ -17,7 +17,9 @@ namespace llvm {
 class LLVMContext;
 
 /// MLModelRunner interface: abstraction of a mechanism for evaluating a
-/// tensorflow "saved model".
+/// ML model. More abstractly, evaluating a function that has tensors as
+/// arguments, described via TensorSpecs, and returns a tensor. Currently, the
+/// latter is assumed to be a scalar, in the absence of more elaborate scenarios.
 /// NOTE: feature indices are expected to be consistent all accross
 /// MLModelRunners (pertaining to the same model), and also Loggers (see
 /// TFUtils.h)
diff --git a/llvm/include/llvm/Analysis/ModelUnderTrainingRunner.h b/llvm/include/llvm/Analysis/ModelUnderTrainingRunner.h
index 7e29bbd2be0961e..1a79f2c36c3f7d8 100644
--- a/llvm/include/llvm/Analysis/ModelUnderTrainingRunner.h
+++ b/llvm/include/llvm/Analysis/ModelUnderTrainingRunner.h
@@ -23,9 +23,10 @@
 
 namespace llvm {
 
-/// ModelUnderTrainingRunner - training mode implementation. It uses TF C APIs
+/// ModelUnderTrainingRunner - training mode implementation. It uses TFLite
 /// to dynamically load and evaluate a TF SavedModel
-/// (https://www.tensorflow.org/guide/saved_model). Runtime performance is
+/// (https://www.tensorflow.org/guide/saved_model) converted to TFLite. See
+/// lib/Analysis/models/saved-model-to-tflite.py. Runtime performance is
 /// sacrificed for ease of use while training.
 class ModelUnderTrainingRunner final : public MLModelRunner {
 public:
diff --git a/llvm/include/llvm/Analysis/TensorSpec.h b/llvm/include/llvm/Analysis/TensorSpec.h
index c50507b7a6b1144..d6b23cf56f6a5ff 100644
--- a/llvm/include/llvm/Analysis/TensorSpec.h
+++ b/llvm/include/llvm/Analysis/TensorSpec.h
@@ -26,11 +26,15 @@ namespace llvm {
 /// Machine Learning on Heterogeneous Distributed Systems", section 4.2, para 2:
 /// https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45166.pdf)
 ///
-/// Known tensor types. The left part is the C type, the right is a name we
-/// can use to identify the type (to implement TensorSpec equality checks), and
-/// to use, if needed, when mapping to an underlying evaluator's type system.
-/// The main requirement is that the C type we use has the same size and
-/// encoding (e.g. endian-ness) as the one used by the evaluator.
+/// Note that the design is motivated by Tensorflow, but it is not intended to
+/// be Tensorflow-specific.
+///
+/// Known tensor types. The left part is the C type, the
+/// right is a name we can use to identify the type (to implement TensorSpec
+/// equality checks), and to use, if needed, when mapping to an underlying
+/// evaluator's type system. The main requirement is that the C type we use has
+/// the same size and encoding (e.g. endian-ness) as the one used by the
+/// evaluator.
 #define SUPPORTED_TENSOR_TYPES(M)                                              \
   M(float, Float)                                                              \
   M(double, Double)                                                            \
diff --git a/llvm/include/llvm/Analysis/Utils/TFUtils.h b/llvm/include/llvm/Analysis/Utils/TFUtils.h
index 04bb8af3a515a8b..817702b869e932e 100644
--- a/llvm/include/llvm/Analysis/Utils/TFUtils.h
+++ b/llvm/include/llvm/Analysis/Utils/TFUtils.h
@@ -1,4 +1,4 @@
-//===- TFUtils.h - utilities for tensorflow C API ---------------*- C++ -*-===//
+//===- TFUtils.h - utilities for TFLite -------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -93,7 +93,7 @@ class TFModelEvaluator final {
     return static_cast<T *>(getUntypedInput(Index));
   }
 
-  /// Returns true if the tensorflow model was loaded successfully, false
+  /// Returns true if the model was loaded successfully, false
   /// otherwise.
   bool isValid() const { return !!Impl; }
 
diff --git a/llvm/include/llvm/Config/llvm-config.h.cmake b/llvm/include/llvm/Config/llvm-config.h.cmake
index 8d0a1bc5dc56588..17b2d47fb6c43a3 100644
--- a/llvm/include/llvm/Config/llvm-config.h.cmake
+++ b/llvm/include/llvm/Config/llvm-config.h.cmake
@@ -98,7 +98,7 @@
 /* Define if zstd compression is available */
 #cmakedefine01 LLVM_ENABLE_ZSTD
 
-/* Define if LLVM is using tflite instead of libtensorflow */
+/* Define if LLVM is using tflite */
 #cmakedefine LLVM_HAVE_TFLITE
 
 /* Define to 1 if you have the <sysexits.h> header file. */
diff --git a/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp b/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp
index 456d58660680d7d..7d51302bcc1adb2 100644
--- a/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp
+++ b/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file implements a model runner using Tensorflow C APIs, allowing the
+// This file implements a model runner using TFLite, allowing the
 // loading of a model from a command line option.
 //
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Analysis/TFLiteUtils.cpp b/llvm/lib/Analysis/TFLiteUtils.cpp
index b2862033e9cfbff..2762e22f28cef32 100644
--- a/llvm/lib/Analysis/TFLiteUtils.cpp
+++ b/llvm/lib/Analysis/TFLiteUtils.cpp
@@ -1,4 +1,4 @@
-//===- TFUtils.cpp - tensorflow evaluation utilities ----------------------===//
+//===- TFUtils.cpp - TFLite-based evaluation utilities --------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file implements utilities for interfacing with tensorflow C APIs.
+// This file implements utilities for interfacing with TFLite.
 //
 //===----------------------------------------------------------------------===//
 #include "llvm/Config/config.h"

>From e28fb6fe58be8a0aac78e7801b811c7031f2d6a8 Mon Sep 17 00:00:00 2001
From: chuongg3 <chuong.goh at arm.com>
Date: Wed, 11 Oct 2023 16:05:25 +0100
Subject: [PATCH 07/38] [AArch64][GlobalISel] Support more types for TRUNC
 (#66927)

G_TRUNC will get lowered into trunc(merge(trunc(unmerge),
trunc(unmerge))) if the source is larger than 128 bits or the truncation
narrows the elements to less than half of their current bit size.

Now mirrors ZEXT/SEXT code more closely for vector types.
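
As a concrete sketch (mirroring the comment added in
LegalizerHelper::lowerTRUNC; register names are illustrative), a
<8 x s32> -> <8 x s8> truncation is expected to expand roughly as:

  %inlo:_(<4 x s32>), %inhi:_(<4 x s32>) = G_UNMERGE_VALUES %in(<8 x s32>)
  %lo16:_(<4 x s16>) = G_TRUNC %inlo(<4 x s32>)
  %hi16:_(<4 x s16>) = G_TRUNC %inhi(<4 x s32>)
  %in16:_(<8 x s16>) = G_CONCAT_VECTORS %lo16(<4 x s16>), %hi16(<4 x s16>)
  %res:_(<8 x s8>) = G_TRUNC %in16(<8 x s16>)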
---
 .../llvm/CodeGen/GlobalISel/LegalizerHelper.h |   1 +
 .../CodeGen/GlobalISel/LegalizerHelper.cpp    |  67 ++-
 .../AArch64/GISel/AArch64LegalizerInfo.cpp    |  65 +--
 .../AArch64/GISel/AArch64LegalizerInfo.h      |   1 -
 .../AArch64/GlobalISel/legalize-xtn.mir       | 531 ++++++++++++++++++
 llvm/test/CodeGen/AArch64/xtn.ll              | 473 ++++++++++++++++
 llvm/test/CodeGen/AArch64/zext.ll             |  48 +-
 7 files changed, 1100 insertions(+), 86 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/legalize-xtn.mir
 create mode 100644 llvm/test/CodeGen/AArch64/xtn.ll

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
index 8516a28d0052d21..86d3cb2bedb95b6 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
@@ -384,6 +384,7 @@ class LegalizerHelper {
   LegalizeResult lowerFunnelShiftAsShifts(MachineInstr &MI);
   LegalizeResult lowerFunnelShift(MachineInstr &MI);
   LegalizeResult lowerEXT(MachineInstr &MI);
+  LegalizeResult lowerTRUNC(MachineInstr &MI);
   LegalizeResult lowerRotateWithReverseRotate(MachineInstr &MI);
   LegalizeResult lowerRotate(MachineInstr &MI);
 
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index ec501083aaefae2..196da03733c7d00 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -3766,6 +3766,8 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
   case G_SEXT:
   case G_ANYEXT:
     return lowerEXT(MI);
+  case G_TRUNC:
+    return lowerTRUNC(MI);
   GISEL_VECREDUCE_CASES_NONSEQ
     return lowerVectorReduction(MI);
   }
@@ -5110,13 +5112,7 @@ LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx,
     MI.eraseFromParent();
     return Legalized;
   }
-  case TargetOpcode::G_TRUNC: {
-    Observer.changingInstr(MI);
-    moreElementsVectorSrc(MI, MoreTy, 1);
-    moreElementsVectorDst(MI, MoreTy, 0);
-    Observer.changedInstr(MI);
-    return Legalized;
-  }
+  case TargetOpcode::G_TRUNC:
   case TargetOpcode::G_FPTRUNC:
   case TargetOpcode::G_FPEXT: {
     if (TypeIdx != 0)
@@ -6165,6 +6161,63 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerEXT(MachineInstr &MI) {
   return UnableToLegalize;
 }
 
+LegalizerHelper::LegalizeResult LegalizerHelper::lowerTRUNC(MachineInstr &MI) {
+  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
+  // Similar to how operand splitting is done in SelectionDAG, we can handle
+  // %res(v8s8) = G_TRUNC %in(v8s32) by generating:
+  //   %inlo(<4x s32>), %inhi(<4 x s32>) = G_UNMERGE %in(<8 x s32>)
+  //   %lo16(<4 x s16>) = G_TRUNC %inlo
+  //   %hi16(<4 x s16>) = G_TRUNC %inhi
+  //   %in16(<8 x s16>) = G_CONCAT_VECTORS %lo16, %hi16
+  //   %res(<8 x s8>) = G_TRUNC %in16
+
+  assert(MI.getOpcode() == TargetOpcode::G_TRUNC);
+
+  Register DstReg = MI.getOperand(0).getReg();
+  Register SrcReg = MI.getOperand(1).getReg();
+  LLT DstTy = MRI.getType(DstReg);
+  LLT SrcTy = MRI.getType(SrcReg);
+
+  if (DstTy.isVector() && isPowerOf2_32(DstTy.getNumElements()) &&
+      isPowerOf2_32(DstTy.getScalarSizeInBits()) &&
+      isPowerOf2_32(SrcTy.getNumElements()) &&
+      isPowerOf2_32(SrcTy.getScalarSizeInBits())) {
+    // Split input type.
+    LLT SplitSrcTy = SrcTy.changeElementCount(
+        SrcTy.getElementCount().divideCoefficientBy(2));
+
+    // First, split the source into two smaller vectors.
+    SmallVector<Register, 2> SplitSrcs;
+    extractParts(SrcReg, SplitSrcTy, 2, SplitSrcs);
+
+    // Truncate the splits into intermediate narrower elements.
+    LLT InterTy;
+    if (DstTy.getScalarSizeInBits() * 2 < SrcTy.getScalarSizeInBits())
+      InterTy = SplitSrcTy.changeElementSize(DstTy.getScalarSizeInBits() * 2);
+    else
+      InterTy = SplitSrcTy.changeElementSize(DstTy.getScalarSizeInBits());
+    for (unsigned I = 0; I < SplitSrcs.size(); ++I) {
+      SplitSrcs[I] = MIRBuilder.buildTrunc(InterTy, SplitSrcs[I]).getReg(0);
+    }
+
+    // Combine the new truncates into one vector
+    auto Merge = MIRBuilder.buildMergeLikeInstr(
+        DstTy.changeElementSize(InterTy.getScalarSizeInBits()), SplitSrcs);
+
+    // Truncate the new vector to the final result type
+    if (DstTy.getScalarSizeInBits() * 2 < SrcTy.getScalarSizeInBits())
+      MIRBuilder.buildTrunc(MI.getOperand(0).getReg(), Merge.getReg(0));
+    else
+      MIRBuilder.buildCopy(MI.getOperand(0).getReg(), Merge.getReg(0));
+
+    MI.eraseFromParent();
+
+    return Legalized;
+  }
+  return UnableToLegalize;
+}
+
 LegalizerHelper::LegalizeResult
 LegalizerHelper::lowerRotateWithReverseRotate(MachineInstr &MI) {
   auto [Dst, DstTy, Src, SrcTy, Amt, AmtTy] = MI.getFirst3RegLLTs();
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index e75f6e891adc334..378a8d0da4925d9 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -536,14 +536,22 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       });
 
   getActionDefinitionsBuilder(G_TRUNC)
+      .legalFor({{v2s32, v2s64}, {v4s16, v4s32}, {v8s8, v8s16}})
+      .moreElementsToNextPow2(0)
+      .clampMaxNumElements(0, s8, 8)
+      .clampMaxNumElements(0, s16, 4)
+      .clampMaxNumElements(0, s32, 2)
       .minScalarOrEltIf(
           [=](const LegalityQuery &Query) { return Query.Types[0].isVector(); },
           0, s8)
-      .customIf([=](const LegalityQuery &Query) {
+      .lowerIf([=](const LegalityQuery &Query) {
         LLT DstTy = Query.Types[0];
         LLT SrcTy = Query.Types[1];
-        return DstTy == v8s8 && SrcTy.getSizeInBits() > 128;
+        return DstTy.isVector() && (SrcTy.getSizeInBits() > 128 ||
+                                    (DstTy.getScalarSizeInBits() * 2 <
+                                     SrcTy.getScalarSizeInBits()));
       })
+
       .alwaysLegal();
 
   getActionDefinitionsBuilder(G_SEXT_INREG)
@@ -1002,8 +1010,6 @@ bool AArch64LegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
     return legalizeShlAshrLshr(MI, MRI, MIRBuilder, Observer);
   case TargetOpcode::G_GLOBAL_VALUE:
     return legalizeSmallCMGlobalValue(MI, MRI, MIRBuilder, Observer);
-  case TargetOpcode::G_TRUNC:
-    return legalizeVectorTrunc(MI, Helper);
   case TargetOpcode::G_SBFX:
   case TargetOpcode::G_UBFX:
     return legalizeBitfieldExtract(MI, MRI, Helper);
@@ -1102,54 +1108,6 @@ bool AArch64LegalizerInfo::legalizeRotate(MachineInstr &MI,
   return true;
 }
 
-static void extractParts(Register Reg, MachineRegisterInfo &MRI,
-                         MachineIRBuilder &MIRBuilder, LLT Ty, int NumParts,
-                         SmallVectorImpl<Register> &VRegs) {
-  for (int I = 0; I < NumParts; ++I)
-    VRegs.push_back(MRI.createGenericVirtualRegister(Ty));
-  MIRBuilder.buildUnmerge(VRegs, Reg);
-}
-
-bool AArch64LegalizerInfo::legalizeVectorTrunc(
-    MachineInstr &MI, LegalizerHelper &Helper) const {
-  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
-  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
-  // Similar to how operand splitting is done in SelectiondDAG, we can handle
-  // %res(v8s8) = G_TRUNC %in(v8s32) by generating:
-  //   %inlo(<4x s32>), %inhi(<4 x s32>) = G_UNMERGE %in(<8 x s32>)
-  //   %lo16(<4 x s16>) = G_TRUNC %inlo
-  //   %hi16(<4 x s16>) = G_TRUNC %inhi
-  //   %in16(<8 x s16>) = G_CONCAT_VECTORS %lo16, %hi16
-  //   %res(<8 x s8>) = G_TRUNC %in16
-
-  Register DstReg = MI.getOperand(0).getReg();
-  Register SrcReg = MI.getOperand(1).getReg();
-  LLT DstTy = MRI.getType(DstReg);
-  LLT SrcTy = MRI.getType(SrcReg);
-  assert(llvm::has_single_bit<uint32_t>(DstTy.getSizeInBits()) &&
-         llvm::has_single_bit<uint32_t>(SrcTy.getSizeInBits()));
-
-  // Split input type.
-  LLT SplitSrcTy =
-      SrcTy.changeElementCount(SrcTy.getElementCount().divideCoefficientBy(2));
-  // First, split the source into two smaller vectors.
-  SmallVector<Register, 2> SplitSrcs;
-  extractParts(SrcReg, MRI, MIRBuilder, SplitSrcTy, 2, SplitSrcs);
-
-  // Truncate the splits into intermediate narrower elements.
-  LLT InterTy = SplitSrcTy.changeElementSize(DstTy.getScalarSizeInBits() * 2);
-  for (unsigned I = 0; I < SplitSrcs.size(); ++I)
-    SplitSrcs[I] = MIRBuilder.buildTrunc(InterTy, SplitSrcs[I]).getReg(0);
-
-  auto Concat = MIRBuilder.buildConcatVectors(
-      DstTy.changeElementSize(DstTy.getScalarSizeInBits() * 2), SplitSrcs);
-
-  Helper.Observer.changingInstr(MI);
-  MI.getOperand(1).setReg(Concat.getReg(0));
-  Helper.Observer.changedInstr(MI);
-  return true;
-}
-
 bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue(
     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder,
     GISelChangeObserver &Observer) const {
@@ -1319,6 +1277,9 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
 
     return true;
   }
+  case Intrinsic::experimental_vector_reverse:
+    // TODO: Add support for vector_reverse
+    return false;
   }
 
   return true;
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
index 853d5a2305ac68a..e6c9182da912dba 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
@@ -46,7 +46,6 @@ class AArch64LegalizerInfo : public LegalizerInfo {
   bool legalizeSmallCMGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI,
                                   MachineIRBuilder &MIRBuilder,
                                   GISelChangeObserver &Observer) const;
-  bool legalizeVectorTrunc(MachineInstr &MI, LegalizerHelper &Helper) const;
   bool legalizeBitfieldExtract(MachineInstr &MI, MachineRegisterInfo &MRI,
                                LegalizerHelper &Helper) const;
   bool legalizeRotate(MachineInstr &MI, MachineRegisterInfo &MRI,
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-xtn.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-xtn.mir
new file mode 100644
index 000000000000000..16b780a83973471
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-xtn.mir
@@ -0,0 +1,531 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
+# RUN: llc -O0 -mtriple=arm64-unknown-unknown -global-isel -run-pass=legalizer -global-isel-abort=2 %s -o - | FileCheck %s
+
+---
+name:            xtn_v2i64_v2i8
+alignment:       4
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $q0
+
+    ; CHECK-LABEL: name: xtn_v2i64_v2i8
+    ; CHECK: liveins: $q0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<2 x s32>) = G_TRUNC [[COPY]](<2 x s64>)
+    ; CHECK-NEXT: $d0 = COPY [[TRUNC]](<2 x s32>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $d0
+    %0:_(<2 x s64>) = COPY $q0
+    %1:_(<2 x s8>) = G_TRUNC %0(<2 x s64>)
+    %2:_(<2 x s32>) = G_ANYEXT %1(<2 x s8>)
+    $d0 = COPY %2(<2 x s32>)
+    RET_ReallyLR implicit $d0
+
+...
+
+---
+name:            xtn_v2i128_v2i8
+alignment:       4
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0, $x1, $x2, $x3
+
+    ; CHECK-LABEL: name: xtn_v2i128_v2i8
+    ; CHECK: liveins: $x0, $x1, $x2, $x3
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x2
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[COPY]](s64), [[COPY1]](s64)
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<2 x s32>) = G_TRUNC [[BUILD_VECTOR]](<2 x s64>)
+    ; CHECK-NEXT: $d0 = COPY [[TRUNC]](<2 x s32>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $d0
+    %1:_(s64) = COPY $x0
+    %2:_(s64) = COPY $x1
+    %3:_(s64) = COPY $x2
+    %4:_(s64) = COPY $x3
+    %5:_(s128) = G_MERGE_VALUES %1(s64), %2(s64)
+    %6:_(s128) = G_MERGE_VALUES %3(s64), %4(s64)
+    %0:_(<2 x s128>) = G_BUILD_VECTOR %5(s128), %6(s128)
+    %7:_(<2 x s8>) = G_TRUNC %0(<2 x s128>)
+    %8:_(<2 x s32>) = G_ANYEXT %7(<2 x s8>)
+    $d0 = COPY %8(<2 x s32>)
+    RET_ReallyLR implicit $d0
+
+...
+
+---
+name:            xtn_v2i64_v2i16
+alignment:       4
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $q0
+
+    ; CHECK-LABEL: name: xtn_v2i64_v2i16
+    ; CHECK: liveins: $q0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<2 x s32>) = G_TRUNC [[COPY]](<2 x s64>)
+    ; CHECK-NEXT: $d0 = COPY [[TRUNC]](<2 x s32>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $d0
+    %0:_(<2 x s64>) = COPY $q0
+    %1:_(<2 x s16>) = G_TRUNC %0(<2 x s64>)
+    %2:_(<2 x s32>) = G_ANYEXT %1(<2 x s16>)
+    $d0 = COPY %2(<2 x s32>)
+    RET_ReallyLR implicit $d0
+
+...
+
+---
+name:            xtn_v2i128_v2i16
+alignment:       4
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0, $x1, $x2, $x3
+
+    ; CHECK-LABEL: name: xtn_v2i128_v2i16
+    ; CHECK: liveins: $x0, $x1, $x2, $x3
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x2
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[COPY]](s64), [[COPY1]](s64)
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<2 x s32>) = G_TRUNC [[BUILD_VECTOR]](<2 x s64>)
+    ; CHECK-NEXT: $d0 = COPY [[TRUNC]](<2 x s32>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $d0
+    %1:_(s64) = COPY $x0
+    %2:_(s64) = COPY $x1
+    %3:_(s64) = COPY $x2
+    %4:_(s64) = COPY $x3
+    %5:_(s128) = G_MERGE_VALUES %1(s64), %2(s64)
+    %6:_(s128) = G_MERGE_VALUES %3(s64), %4(s64)
+    %0:_(<2 x s128>) = G_BUILD_VECTOR %5(s128), %6(s128)
+    %7:_(<2 x s16>) = G_TRUNC %0(<2 x s128>)
+    %8:_(<2 x s32>) = G_ANYEXT %7(<2 x s16>)
+    $d0 = COPY %8(<2 x s32>)
+    RET_ReallyLR implicit $d0
+
+...
+
+---
+name:            xtn_v2i128_v2i32
+alignment:       4
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0, $x1, $x2, $x3
+
+    ; CHECK-LABEL: name: xtn_v2i128_v2i32
+    ; CHECK: liveins: $x0, $x1, $x2, $x3
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x2
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[COPY]](s64), [[COPY1]](s64)
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<2 x s32>) = G_TRUNC [[BUILD_VECTOR]](<2 x s64>)
+    ; CHECK-NEXT: $d0 = COPY [[TRUNC]](<2 x s32>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $d0
+    %1:_(s64) = COPY $x0
+    %2:_(s64) = COPY $x1
+    %3:_(s64) = COPY $x2
+    %4:_(s64) = COPY $x3
+    %5:_(s128) = G_MERGE_VALUES %1(s64), %2(s64)
+    %6:_(s128) = G_MERGE_VALUES %3(s64), %4(s64)
+    %0:_(<2 x s128>) = G_BUILD_VECTOR %5(s128), %6(s128)
+    %7:_(<2 x s32>) = G_TRUNC %0(<2 x s128>)
+    $d0 = COPY %7(<2 x s32>)
+    RET_ReallyLR implicit $d0
+
+...
+
+---
+name:            xtn_v2i128_v2i64
+alignment:       4
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $x0, $x1, $x2, $x3
+
+    ; CHECK-LABEL: name: xtn_v2i128_v2i64
+    ; CHECK: liveins: $x0, $x1, $x2, $x3
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x2
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[COPY]](s64), [[COPY1]](s64)
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s64>) = COPY [[BUILD_VECTOR]](<2 x s64>)
+    ; CHECK-NEXT: $q0 = COPY [[COPY2]](<2 x s64>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
+    %1:_(s64) = COPY $x0
+    %2:_(s64) = COPY $x1
+    %3:_(s64) = COPY $x2
+    %4:_(s64) = COPY $x3
+    %5:_(s128) = G_MERGE_VALUES %1(s64), %2(s64)
+    %6:_(s128) = G_MERGE_VALUES %3(s64), %4(s64)
+    %0:_(<2 x s128>) = G_BUILD_VECTOR %5(s128), %6(s128)
+    %7:_(<2 x s64>) = G_TRUNC %0(<2 x s128>)
+    $q0 = COPY %7(<2 x s64>)
+    RET_ReallyLR implicit $q0
+
+...
+
+---
+name:            xtn_v3i16_v3i8
+alignment:       4
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $d0
+
+    ; CHECK-LABEL: name: xtn_v3i16_v3i8
+    ; CHECK: liveins: $d0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $d0
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](<4 x s16>)
+    ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s16)
+    ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s16)
+    ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s16)
+    ; CHECK-NEXT: $w0 = COPY [[ANYEXT]](s32)
+    ; CHECK-NEXT: $w1 = COPY [[ANYEXT1]](s32)
+    ; CHECK-NEXT: $w2 = COPY [[ANYEXT2]](s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0, implicit $w1, implicit $w2
+    %1:_(<4 x s16>) = COPY $d0
+    %2:_(s16), %3:_(s16), %4:_(s16), %5:_(s16) = G_UNMERGE_VALUES %1(<4 x s16>)
+    %0:_(<3 x s16>) = G_BUILD_VECTOR %2(s16), %3(s16), %4(s16)
+    %6:_(<3 x s8>) = G_TRUNC %0(<3 x s16>)
+    %10:_(s8), %11:_(s8), %12:_(s8) = G_UNMERGE_VALUES %6(<3 x s8>)
+    %7:_(s32) = G_ANYEXT %10(s8)
+    %8:_(s32) = G_ANYEXT %11(s8)
+    %9:_(s32) = G_ANYEXT %12(s8)
+    $w0 = COPY %7(s32)
+    $w1 = COPY %8(s32)
+    $w2 = COPY %9(s32)
+    RET_ReallyLR implicit $w0, implicit $w1, implicit $w2
+
+...
+
+---
+name:            xtn_v3i32_v3i8
+alignment:       4
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $q0
+
+    ; CHECK-LABEL: name: xtn_v3i32_v3i8
+    ; CHECK: liveins: $q0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+    ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](<4 x s32>)
+    ; CHECK-NEXT: $w0 = COPY [[UV]](s32)
+    ; CHECK-NEXT: $w1 = COPY [[UV1]](s32)
+    ; CHECK-NEXT: $w2 = COPY [[UV2]](s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0, implicit $w1, implicit $w2
+    %1:_(<4 x s32>) = COPY $q0
+    %2:_(s32), %3:_(s32), %4:_(s32), %5:_(s32) = G_UNMERGE_VALUES %1(<4 x s32>)
+    %0:_(<3 x s32>) = G_BUILD_VECTOR %2(s32), %3(s32), %4(s32)
+    %6:_(<3 x s8>) = G_TRUNC %0(<3 x s32>)
+    %10:_(s8), %11:_(s8), %12:_(s8) = G_UNMERGE_VALUES %6(<3 x s8>)
+    %7:_(s32) = G_ANYEXT %10(s8)
+    %8:_(s32) = G_ANYEXT %11(s8)
+    %9:_(s32) = G_ANYEXT %12(s8)
+    $w0 = COPY %7(s32)
+    $w1 = COPY %8(s32)
+    $w2 = COPY %9(s32)
+    RET_ReallyLR implicit $w0, implicit $w1, implicit $w2
+
+...
+
+---
+name:            xtn_v3i64_v3i8
+alignment:       4
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $d0, $d1, $d2
+
+    ; CHECK-LABEL: name: xtn_v3i64_v3i8
+    ; CHECK: liveins: $d0, $d1, $d2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $d0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $d1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $d2
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64)
+    ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s32) = G_TRUNC [[COPY2]](s64)
+    ; CHECK-NEXT: $w0 = COPY [[TRUNC]](s32)
+    ; CHECK-NEXT: $w1 = COPY [[TRUNC1]](s32)
+    ; CHECK-NEXT: $w2 = COPY [[TRUNC2]](s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0, implicit $w1, implicit $w2
+    %1:_(s64) = COPY $d0
+    %2:_(s64) = COPY $d1
+    %3:_(s64) = COPY $d2
+    %0:_(<3 x s64>) = G_BUILD_VECTOR %1(s64), %2(s64), %3(s64)
+    %4:_(<3 x s8>) = G_TRUNC %0(<3 x s64>)
+    %8:_(s8), %9:_(s8), %10:_(s8) = G_UNMERGE_VALUES %4(<3 x s8>)
+    %5:_(s32) = G_ANYEXT %8(s8)
+    %6:_(s32) = G_ANYEXT %9(s8)
+    %7:_(s32) = G_ANYEXT %10(s8)
+    $w0 = COPY %5(s32)
+    $w1 = COPY %6(s32)
+    $w2 = COPY %7(s32)
+    RET_ReallyLR implicit $w0, implicit $w1, implicit $w2
+
+...
+
+---
+name:            xtn_v3i64_v3i16
+alignment:       4
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $d0, $d1, $d2
+
+    ; CHECK-LABEL: name: xtn_v3i64_v3i16
+    ; CHECK: liveins: $d0, $d1, $d2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $d0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $d1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $d2
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s64)
+    ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s64)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16), [[TRUNC2]](s16), [[DEF]](s16)
+    ; CHECK-NEXT: $d0 = COPY [[BUILD_VECTOR]](<4 x s16>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $d0
+    %1:_(s64) = COPY $d0
+    %2:_(s64) = COPY $d1
+    %3:_(s64) = COPY $d2
+    %0:_(<3 x s64>) = G_BUILD_VECTOR %1(s64), %2(s64), %3(s64)
+    %4:_(<3 x s16>) = G_TRUNC %0(<3 x s64>)
+    %5:_(s16), %6:_(s16), %7:_(s16) = G_UNMERGE_VALUES %4(<3 x s16>)
+    %8:_(s16) = G_IMPLICIT_DEF
+    %9:_(<4 x s16>) = G_BUILD_VECTOR %5(s16), %6(s16), %7(s16), %8(s16)
+    $d0 = COPY %9(<4 x s16>)
+    RET_ReallyLR implicit $d0
+
+...
+
+---
+name:            xtn_v3i64_v3i32
+alignment:       4
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $d0, $d1, $d2
+
+    ; CHECK-LABEL: name: xtn_v3i64_v3i32
+    ; CHECK: liveins: $d0, $d1, $d2
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $d0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $d1
+    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $d2
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64)
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64)
+    ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s32) = G_TRUNC [[COPY2]](s64)
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[TRUNC]](s32), [[TRUNC1]](s32), [[TRUNC2]](s32), [[DEF]](s32)
+    ; CHECK-NEXT: $q0 = COPY [[BUILD_VECTOR]](<4 x s32>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
+    %1:_(s64) = COPY $d0
+    %2:_(s64) = COPY $d1
+    %3:_(s64) = COPY $d2
+    %0:_(<3 x s64>) = G_BUILD_VECTOR %1(s64), %2(s64), %3(s64)
+    %4:_(<3 x s32>) = G_TRUNC %0(<3 x s64>)
+    %5:_(s32), %6:_(s32), %7:_(s32) = G_UNMERGE_VALUES %4(<3 x s32>)
+    %8:_(s32) = G_IMPLICIT_DEF
+    %9:_(<4 x s32>) = G_BUILD_VECTOR %5(s32), %6(s32), %7(s32), %8(s32)
+    $q0 = COPY %9(<4 x s32>)
+    RET_ReallyLR implicit $q0
+
+...
+
+---
+name:            xtn_v4i32_v4i8
+alignment:       4
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $q0
+
+    ; CHECK-LABEL: name: xtn_v4i32_v4i8
+    ; CHECK: liveins: $q0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[COPY]](<4 x s32>)
+    ; CHECK-NEXT: $d0 = COPY [[TRUNC]](<4 x s16>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $d0
+    %0:_(<4 x s32>) = COPY $q0
+    %1:_(<4 x s8>) = G_TRUNC %0(<4 x s32>)
+    %2:_(<4 x s16>) = G_ANYEXT %1(<4 x s8>)
+    $d0 = COPY %2(<4 x s16>)
+    RET_ReallyLR implicit $d0
+
+...
+
+---
+name:            xtn_v4i64_v4i8
+alignment:       4
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $q0, $q1
+
+    ; CHECK-LABEL: name: xtn_v4i64_v4i8
+    ; CHECK: liveins: $q0, $q1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<2 x s32>) = G_TRUNC [[COPY]](<2 x s64>)
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(<2 x s32>) = G_TRUNC [[COPY1]](<2 x s64>)
+    ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[TRUNC]](<2 x s32>), [[TRUNC1]](<2 x s32>)
+    ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[CONCAT_VECTORS]](<4 x s32>)
+    ; CHECK-NEXT: $d0 = COPY [[TRUNC2]](<4 x s16>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $d0
+    %1:_(<2 x s64>) = COPY $q0
+    %2:_(<2 x s64>) = COPY $q1
+    %0:_(<4 x s64>) = G_CONCAT_VECTORS %1(<2 x s64>), %2(<2 x s64>)
+    %3:_(<4 x s8>) = G_TRUNC %0(<4 x s64>)
+    %4:_(<4 x s16>) = G_ANYEXT %3(<4 x s8>)
+    $d0 = COPY %4(<4 x s16>)
+    RET_ReallyLR implicit $d0
+
+...
+
+---
+name:            xtn_v4i64_v4i16
+alignment:       4
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $q0, $q1
+
+    ; CHECK-LABEL: name: xtn_v4i64_v4i16
+    ; CHECK: liveins: $q0, $q1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<2 x s32>) = G_TRUNC [[COPY]](<2 x s64>)
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(<2 x s32>) = G_TRUNC [[COPY1]](<2 x s64>)
+    ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[TRUNC]](<2 x s32>), [[TRUNC1]](<2 x s32>)
+    ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[CONCAT_VECTORS]](<4 x s32>)
+    ; CHECK-NEXT: $d0 = COPY [[TRUNC2]](<4 x s16>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $d0
+    %1:_(<2 x s64>) = COPY $q0
+    %2:_(<2 x s64>) = COPY $q1
+    %0:_(<4 x s64>) = G_CONCAT_VECTORS %1(<2 x s64>), %2(<2 x s64>)
+    %3:_(<4 x s16>) = G_TRUNC %0(<4 x s64>)
+    $d0 = COPY %3(<4 x s16>)
+    RET_ReallyLR implicit $d0
+
+...
+
+---
+name:            xtn_v4i64_v4i32
+alignment:       4
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $q0, $q1
+
+    ; CHECK-LABEL: name: xtn_v4i64_v4i32
+    ; CHECK: liveins: $q0, $q1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<2 x s32>) = G_TRUNC [[COPY]](<2 x s64>)
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(<2 x s32>) = G_TRUNC [[COPY1]](<2 x s64>)
+    ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[TRUNC]](<2 x s32>), [[TRUNC1]](<2 x s32>)
+    ; CHECK-NEXT: $q0 = COPY [[CONCAT_VECTORS]](<4 x s32>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
+    %1:_(<2 x s64>) = COPY $q0
+    %2:_(<2 x s64>) = COPY $q1
+    %0:_(<4 x s64>) = G_CONCAT_VECTORS %1(<2 x s64>), %2(<2 x s64>)
+    %3:_(<4 x s32>) = G_TRUNC %0(<4 x s64>)
+    $q0 = COPY %3(<4 x s32>)
+    RET_ReallyLR implicit $q0
+
+...
+
+---
+name:            xtn_v8i32_v8i8
+alignment:       4
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $q0, $q1
+
+    ; CHECK-LABEL: name: xtn_v8i32_v8i8
+    ; CHECK: liveins: $q0, $q1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[COPY]](<4 x s32>)
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[COPY1]](<4 x s32>)
+    ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[TRUNC]](<4 x s16>), [[TRUNC1]](<4 x s16>)
+    ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(<8 x s8>) = G_TRUNC [[CONCAT_VECTORS]](<8 x s16>)
+    ; CHECK-NEXT: $d0 = COPY [[TRUNC2]](<8 x s8>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $d0
+    %1:_(<4 x s32>) = COPY $q0
+    %2:_(<4 x s32>) = COPY $q1
+    %0:_(<8 x s32>) = G_CONCAT_VECTORS %1(<4 x s32>), %2(<4 x s32>)
+    %3:_(<8 x s8>) = G_TRUNC %0(<8 x s32>)
+    $d0 = COPY %3(<8 x s8>)
+    RET_ReallyLR implicit $d0
+
+...
+
+---
+name:            xtn_v8i32_v8i16
+alignment:       4
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $q0, $q1
+
+    ; CHECK-LABEL: name: xtn_v8i32_v8i16
+    ; CHECK: liveins: $q0, $q1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[COPY]](<4 x s32>)
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[COPY1]](<4 x s32>)
+    ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[TRUNC]](<4 x s16>), [[TRUNC1]](<4 x s16>)
+    ; CHECK-NEXT: $q0 = COPY [[CONCAT_VECTORS]](<8 x s16>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
+    %1:_(<4 x s32>) = COPY $q0
+    %2:_(<4 x s32>) = COPY $q1
+    %0:_(<8 x s32>) = G_CONCAT_VECTORS %1(<4 x s32>), %2(<4 x s32>)
+    %3:_(<8 x s16>) = G_TRUNC %0(<8 x s32>)
+    $q0 = COPY %3(<8 x s16>)
+    RET_ReallyLR implicit $q0
+
+...
+
+---
+name:            xtn_v16i16_v16i8
+alignment:       4
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $q0, $q1
+
+    ; CHECK-LABEL: name: xtn_v16i16_v16i8
+    ; CHECK: liveins: $q0, $q1
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s16>) = COPY $q1
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(<8 x s8>) = G_TRUNC [[COPY]](<8 x s16>)
+    ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(<8 x s8>) = G_TRUNC [[COPY1]](<8 x s16>)
+    ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s8>) = G_CONCAT_VECTORS [[TRUNC]](<8 x s8>), [[TRUNC1]](<8 x s8>)
+    ; CHECK-NEXT: $q0 = COPY [[CONCAT_VECTORS]](<16 x s8>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
+    %1:_(<8 x s16>) = COPY $q0
+    %2:_(<8 x s16>) = COPY $q1
+    %0:_(<16 x s16>) = G_CONCAT_VECTORS %1(<8 x s16>), %2(<8 x s16>)
+    %3:_(<16 x s8>) = G_TRUNC %0(<16 x s16>)
+    $q0 = COPY %3(<16 x s8>)
+    RET_ReallyLR implicit $q0
+
+...
diff --git a/llvm/test/CodeGen/AArch64/xtn.ll b/llvm/test/CodeGen/AArch64/xtn.ll
new file mode 100644
index 000000000000000..0dd4e3644b78356
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/xtn.ll
@@ -0,0 +1,473 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+
+define i8 @xtn_i16_to_i8(i16 %a) {
+; CHECK-LABEL: xtn_i16_to_i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ret
+entry:
+  %arg1 = trunc i16 %a to i8
+  ret i8 %arg1
+}
+
+define i8 @xtn_i32_to_i8(i32 %a) {
+; CHECK-LABEL: xtn_i32_to_i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ret
+entry:
+  %arg1 = trunc i32 %a to i8
+  ret i8 %arg1
+}
+
+define i8 @xtn_i64_to_i8(i64 %a) {
+; CHECK-LABEL: xtn_i64_to_i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+entry:
+  %arg1 = trunc i64 %a to i8
+  ret i8 %arg1
+}
+
+define i8 @xtn_i128_to_i8(i128 %a) {
+; CHECK-LABEL: xtn_i128_to_i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+entry:
+  %arg1 = trunc i128 %a to i8
+  ret i8 %arg1
+}
+
+define i16 @xtn_i32_to_i16(i32 %a) {
+; CHECK-LABEL: xtn_i32_to_i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ret
+entry:
+  %arg1 = trunc i32 %a to i16
+  ret i16 %arg1
+}
+
+define i16 @xtn_i64_to_i16(i64 %a) {
+; CHECK-LABEL: xtn_i64_to_i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+entry:
+  %arg1 = trunc i64 %a to i16
+  ret i16 %arg1
+}
+
+define i16 @xtn_i128_to_i16(i128 %a) {
+; CHECK-LABEL: xtn_i128_to_i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+entry:
+  %arg1 = trunc i128 %a to i16
+  ret i16 %arg1
+}
+
+define i32 @xtn_i64_to_i32(i64 %a) {
+; CHECK-LABEL: xtn_i64_to_i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+entry:
+  %arg1 = trunc i64 %a to i32
+  ret i32 %arg1
+}
+
+define i32 @xtn_i128_to_i32(i128 %a) {
+; CHECK-LABEL: xtn_i128_to_i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+entry:
+  %arg1 = trunc i128 %a to i32
+  ret i32 %arg1
+}
+
+define i64 @xtn_i128_to_i64(i128 %a) {
+; CHECK-LABEL: xtn_i128_to_i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ret
+entry:
+  %arg1 = trunc i128 %a to i64
+  ret i64 %arg1
+}
+
+define <2 x i8> @xtn_v2i16_v2i8(<2 x i16> %a) {
+; CHECK-LABEL: xtn_v2i16_v2i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ret
+entry:
+  %arg1 = trunc <2 x i16> %a to <2 x i8>
+  ret <2 x i8> %arg1
+}
+
+define <2 x i8> @xtn_v2i32_v2i8(<2 x i32> %a) {
+; CHECK-LABEL: xtn_v2i32_v2i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ret
+entry:
+  %arg1 = trunc <2 x i32> %a to <2 x i8>
+  ret <2 x i8> %arg1
+}
+
+define <2 x i8> @xtn_v2i64_v2i8(<2 x i64> %a) {
+; CHECK-LABEL: xtn_v2i64_v2i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    ret
+entry:
+  %arg1 = trunc <2 x i64> %a to <2 x i8>
+  ret <2 x i8> %arg1
+}
+
+define <2 x i8> @xtn_v2i128_v2i8(<2 x i128> %a) {
+; CHECK-SD-LABEL: xtn_v2i128_v2i8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmov s0, w0
+; CHECK-SD-NEXT:    mov v0.s[1], w2
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: xtn_v2i128_v2i8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov d0, x0
+; CHECK-GI-NEXT:    mov v0.d[1], x2
+; CHECK-GI-NEXT:    xtn v0.2s, v0.2d
+; CHECK-GI-NEXT:    ret
+entry:
+  %arg1 = trunc <2 x i128> %a to <2 x i8>
+  ret <2 x i8> %arg1
+}
+
+define <2 x i16> @xtn_v2i32_v2i16(<2 x i32> %a) {
+; CHECK-LABEL: xtn_v2i32_v2i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ret
+entry:
+  %arg1 = trunc <2 x i32> %a to <2 x i16>
+  ret <2 x i16> %arg1
+}
+
+define <2 x i16> @xtn_v2i64_v2i16(<2 x i64> %a) {
+; CHECK-LABEL: xtn_v2i64_v2i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    ret
+entry:
+  %arg1 = trunc <2 x i64> %a to <2 x i16>
+  ret <2 x i16> %arg1
+}
+
+define <2 x i16> @xtn_v2i128_v2i16(<2 x i128> %a) {
+; CHECK-SD-LABEL: xtn_v2i128_v2i16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmov s0, w0
+; CHECK-SD-NEXT:    mov v0.s[1], w2
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: xtn_v2i128_v2i16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov d0, x0
+; CHECK-GI-NEXT:    mov v0.d[1], x2
+; CHECK-GI-NEXT:    xtn v0.2s, v0.2d
+; CHECK-GI-NEXT:    ret
+entry:
+  %arg1 = trunc <2 x i128> %a to <2 x i16>
+  ret <2 x i16> %arg1
+}
+
+define <2 x i32> @xtn_v2i64_v2i32(<2 x i64> %a) {
+; CHECK-LABEL: xtn_v2i64_v2i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    ret
+entry:
+  %arg1 = trunc <2 x i64> %a to <2 x i32>
+  ret <2 x i32> %arg1
+}
+
+define <2 x i32> @xtn_v2i128_v2i32(<2 x i128> %a) {
+; CHECK-SD-LABEL: xtn_v2i128_v2i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmov s0, w0
+; CHECK-SD-NEXT:    mov v0.s[1], w2
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: xtn_v2i128_v2i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov d0, x0
+; CHECK-GI-NEXT:    mov v0.d[1], x2
+; CHECK-GI-NEXT:    xtn v0.2s, v0.2d
+; CHECK-GI-NEXT:    ret
+entry:
+  %arg1 = trunc <2 x i128> %a to <2 x i32>
+  ret <2 x i32> %arg1
+}
+
+define <2 x i64> @xtn_v2i128_v2i64(<2 x i128> %a) {
+; CHECK-LABEL: xtn_v2i128_v2i64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fmov d0, x0
+; CHECK-NEXT:    mov v0.d[1], x2
+; CHECK-NEXT:    ret
+entry:
+  %arg1 = trunc <2 x i128> %a to <2 x i64>
+  ret <2 x i64> %arg1
+}
+
+define <3 x i8> @xtn_v3i16_v3i8(<3 x i16> %a) {
+; CHECK-SD-LABEL: xtn_v3i16_v3i8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    umov w0, v0.h[0]
+; CHECK-SD-NEXT:    umov w1, v0.h[1]
+; CHECK-SD-NEXT:    umov w2, v0.h[2]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: xtn_v3i16_v3i8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-NEXT:    mov h2, v0.h[2]
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    fmov w1, s1
+; CHECK-GI-NEXT:    fmov w2, s2
+; CHECK-GI-NEXT:    ret
+entry:
+  %arg1 = trunc <3 x i16> %a to <3 x i8>
+  ret <3 x i8> %arg1
+}
+
+define <3 x i8> @xtn_v3i32_v3i8(<3 x i32> %a) {
+; CHECK-SD-LABEL: xtn_v3i32_v3i8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
+; CHECK-SD-NEXT:    umov w0, v0.h[0]
+; CHECK-SD-NEXT:    umov w1, v0.h[1]
+; CHECK-SD-NEXT:    umov w2, v0.h[2]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: xtn_v3i32_v3i8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov s1, v0.s[1]
+; CHECK-GI-NEXT:    mov s2, v0.s[2]
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    fmov w1, s1
+; CHECK-GI-NEXT:    fmov w2, s2
+; CHECK-GI-NEXT:    ret
+entry:
+  %arg1 = trunc <3 x i32> %a to <3 x i8>
+  ret <3 x i8> %arg1
+}
+
+define <3 x i8> @xtn_v3i64_v3i8(<3 x i64> %a) {
+; CHECK-SD-LABEL: xtn_v3i64_v3i8:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT:    xtn v1.2s, v2.2d
+; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
+; CHECK-SD-NEXT:    fmov w2, s1
+; CHECK-SD-NEXT:    mov w1, v0.s[1]
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: xtn_v3i64_v3i8:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov x0, d0
+; CHECK-GI-NEXT:    fmov x1, d1
+; CHECK-GI-NEXT:    fmov x2, d2
+; CHECK-GI-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-GI-NEXT:    // kill: def $w1 killed $w1 killed $x1
+; CHECK-GI-NEXT:    // kill: def $w2 killed $w2 killed $x2
+; CHECK-GI-NEXT:    ret
+entry:
+  %arg1 = trunc <3 x i64> %a to <3 x i8>
+  ret <3 x i8> %arg1
+}
+
+define <3 x i16> @xtn_v3i32_v3i16(<3 x i32> %a) {
+; CHECK-SD-LABEL: xtn_v3i32_v3i16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: xtn_v3i32_v3i16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    mov s1, v0.s[1]
+; CHECK-GI-NEXT:    mov s2, v0.s[2]
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    mov v0.h[2], v2.h[0]
+; CHECK-GI-NEXT:    mov v0.h[3], v0.h[0]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
+entry:
+  %arg1 = trunc <3 x i32> %a to <3 x i16>
+  ret <3 x i16> %arg1
+}
+
+define <3 x i16> @xtn_v3i64_v3i16(<3 x i64> %a) {
+; CHECK-SD-LABEL: xtn_v3i64_v3i16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v2.4s
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: xtn_v3i64_v3i16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    fmov s1, w9
+; CHECK-GI-NEXT:    fmov x8, d2
+; CHECK-GI-NEXT:    mov v0.h[1], v1.h[0]
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    mov v0.h[2], v1.h[0]
+; CHECK-GI-NEXT:    mov v0.h[3], v0.h[0]
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT:    ret
+entry:
+  %arg1 = trunc <3 x i64> %a to <3 x i16>
+  ret <3 x i16> %arg1
+}
+
+define <3 x i32> @xtn_v3i64_v3i32(<3 x i64> %a) {
+; CHECK-SD-LABEL: xtn_v3i64_v3i32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v2.4s
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: xtn_v3i64_v3i32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov x8, d0
+; CHECK-GI-NEXT:    fmov x9, d1
+; CHECK-GI-NEXT:    fmov s0, w8
+; CHECK-GI-NEXT:    fmov x8, d2
+; CHECK-GI-NEXT:    mov v0.s[1], w9
+; CHECK-GI-NEXT:    mov v0.s[2], w8
+; CHECK-GI-NEXT:    mov v0.s[3], w8
+; CHECK-GI-NEXT:    ret
+entry:
+  %arg1 = trunc <3 x i64> %a to <3 x i32>
+  ret <3 x i32> %arg1
+}
+
+define <4 x i8> @xtn_v4i16_v4i8(<4 x i16> %a) {
+; CHECK-LABEL: xtn_v4i16_v4i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ret
+entry:
+  %arg1 = trunc <4 x i16> %a to <4 x i8>
+  ret <4 x i8> %arg1
+}
+
+define <4 x i8> @xtn_v4i32_v4i8(<4 x i32> %a) {
+; CHECK-LABEL: xtn_v4i32_v4i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    ret
+entry:
+  %arg1 = trunc <4 x i32> %a to <4 x i8>
+  ret <4 x i8> %arg1
+}
+
+define <4 x i8> @xtn_v4i64_v4i8(<4 x i64> %a) {
+; CHECK-LABEL: xtn_v4i64_v4i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    ret
+entry:
+  %arg1 = trunc <4 x i64> %a to <4 x i8>
+  ret <4 x i8> %arg1
+}
+
+define <4 x i16> @xtn_v4i32_v4i16(<4 x i32> %a) {
+; CHECK-LABEL: xtn_v4i32_v4i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    ret
+entry:
+  %arg1 = trunc <4 x i32> %a to <4 x i16>
+  ret <4 x i16> %arg1
+}
+
+define <4 x i16> @xtn_v4i64_v4i16(<4 x i64> %a) {
+; CHECK-LABEL: xtn_v4i64_v4i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    ret
+entry:
+  %arg1 = trunc <4 x i64> %a to <4 x i16>
+  ret <4 x i16> %arg1
+}
+
+define <4 x i32> @xtn_v4i64_v4i32(<4 x i64> %a) {
+; CHECK-LABEL: xtn_v4i64_v4i32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ret
+entry:
+  %arg1 = trunc <4 x i64> %a to <4 x i32>
+  ret <4 x i32> %arg1
+}
+
+define <8 x i8> @xtn_v8i16_v8i8(<8 x i16> %a) {
+; CHECK-LABEL: xtn_v8i16_v8i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    xtn v0.8b, v0.8h
+; CHECK-NEXT:    ret
+entry:
+  %arg1 = trunc <8 x i16> %a to <8 x i8>
+  ret <8 x i8> %arg1
+}
+
+define <8 x i8> @xtn_v8i32_v8i8(<8 x i32> %a) {
+; CHECK-LABEL: xtn_v8i32_v8i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    xtn v0.8b, v0.8h
+; CHECK-NEXT:    ret
+entry:
+  %arg1 = trunc <8 x i32> %a to <8 x i8>
+  ret <8 x i8> %arg1
+}
+
+define <8 x i16> @xtn_v8i32_v8i16(<8 x i32> %a) {
+; CHECK-LABEL: xtn_v8i32_v8i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ret
+entry:
+  %arg1 = trunc <8 x i32> %a to <8 x i16>
+  ret <8 x i16> %arg1
+}
+
+define <16 x i8> @xtn_v16i16_v16i8(<16 x i16> %a) {
+; CHECK-LABEL: xtn_v16i16_v16i8:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    uzp1 v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    ret
+entry:
+  %arg1 = trunc <16 x i16> %a to <16 x i8>
+  ret <16 x i8> %arg1
+}
diff --git a/llvm/test/CodeGen/AArch64/zext.ll b/llvm/test/CodeGen/AArch64/zext.ll
index 8b30ee257ad2055..8573a8a2d2571d6 100644
--- a/llvm/test/CodeGen/AArch64/zext.ll
+++ b/llvm/test/CodeGen/AArch64/zext.ll
@@ -2,8 +2,6 @@
 ; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-SD
 ; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
-; CHECK-GI:       warning: Instruction selection used fallback path for zext_v16i10_v16i16
-
 define i16 @zext_i8_to_i16(i8 %a) {
 ; CHECK-LABEL: zext_i8_to_i16:
 ; CHECK:       // %bb.0: // %entry
@@ -242,19 +240,18 @@ define <3 x i16> @zext_v3i8_v3i16(<3 x i8> %a) {
 ;
 ; CHECK-GI-LABEL: zext_v3i8_v3i16:
 ; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov s0, w0
 ; CHECK-GI-NEXT:    mov w8, #255 // =0xff
-; CHECK-GI-NEXT:    fmov s1, w0
-; CHECK-GI-NEXT:    fmov s2, w1
-; CHECK-GI-NEXT:    fmov s0, w8
-; CHECK-GI-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT:    fmov s2, w2
-; CHECK-GI-NEXT:    mov v3.16b, v0.16b
-; CHECK-GI-NEXT:    mov v3.h[1], v0.h[0]
-; CHECK-GI-NEXT:    mov v1.h[2], v2.h[0]
-; CHECK-GI-NEXT:    mov v3.h[2], v0.h[0]
-; CHECK-GI-NEXT:    mov v1.h[3], v0.h[0]
-; CHECK-GI-NEXT:    mov v3.h[3], v0.h[0]
-; CHECK-GI-NEXT:    and v0.8b, v1.8b, v3.8b
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    mov v0.s[1], w1
+; CHECK-GI-NEXT:    mov v2.16b, v1.16b
+; CHECK-GI-NEXT:    mov v0.s[2], w2
+; CHECK-GI-NEXT:    mov v2.h[1], v1.h[0]
+; CHECK-GI-NEXT:    mov v0.s[3], w8
+; CHECK-GI-NEXT:    mov v2.h[2], v1.h[0]
+; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    mov v2.h[3], v0.h[0]
+; CHECK-GI-NEXT:    and v0.8b, v0.8b, v2.8b
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = zext <3 x i8> %a to <3 x i16>
@@ -425,19 +422,18 @@ define <3 x i16> @zext_v3i10_v3i16(<3 x i10> %a) {
 ;
 ; CHECK-GI-LABEL: zext_v3i10_v3i16:
 ; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov s0, w0
 ; CHECK-GI-NEXT:    mov w8, #1023 // =0x3ff
-; CHECK-GI-NEXT:    fmov s1, w0
-; CHECK-GI-NEXT:    fmov s2, w1
-; CHECK-GI-NEXT:    fmov s0, w8
-; CHECK-GI-NEXT:    mov v1.h[1], v2.h[0]
-; CHECK-GI-NEXT:    fmov s2, w2
-; CHECK-GI-NEXT:    mov v3.16b, v0.16b
-; CHECK-GI-NEXT:    mov v3.h[1], v0.h[0]
-; CHECK-GI-NEXT:    mov v1.h[2], v2.h[0]
-; CHECK-GI-NEXT:    mov v3.h[2], v0.h[0]
-; CHECK-GI-NEXT:    mov v1.h[3], v0.h[0]
-; CHECK-GI-NEXT:    mov v3.h[3], v0.h[0]
-; CHECK-GI-NEXT:    and v0.8b, v1.8b, v3.8b
+; CHECK-GI-NEXT:    fmov s1, w8
+; CHECK-GI-NEXT:    mov v0.s[1], w1
+; CHECK-GI-NEXT:    mov v2.16b, v1.16b
+; CHECK-GI-NEXT:    mov v0.s[2], w2
+; CHECK-GI-NEXT:    mov v2.h[1], v1.h[0]
+; CHECK-GI-NEXT:    mov v0.s[3], w8
+; CHECK-GI-NEXT:    mov v2.h[2], v1.h[0]
+; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
+; CHECK-GI-NEXT:    mov v2.h[3], v0.h[0]
+; CHECK-GI-NEXT:    and v0.8b, v0.8b, v2.8b
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = zext <3 x i10> %a to <3 x i16>

>From 601ae249dfc1f324a273e700843edefe3432d7fe Mon Sep 17 00:00:00 2001
From: Kiran Chandramohan <kiran.chandramohan at arm.com>
Date: Wed, 11 Oct 2023 14:50:06 +0000
Subject: [PATCH 08/38] [Flang][OpenMP] NFC: Port a few parallel tests to HLFIR
 flow

---
 .../OpenMP/parallel-private-clause-fixes.f90  |  92 ++++++++++++++++
 .../Lower/OpenMP/parallel-reduction-add.f90   | 101 ++++++++++++++++++
 flang/test/Lower/OpenMP/parallel-sections.f90 |  58 ++++++++++
 .../OpenMP/parallel-wsloop-firstpriv.f90      |  69 ++++++++++++
 4 files changed, 320 insertions(+)
 create mode 100644 flang/test/Lower/OpenMP/parallel-private-clause-fixes.f90
 create mode 100644 flang/test/Lower/OpenMP/parallel-reduction-add.f90
 create mode 100644 flang/test/Lower/OpenMP/parallel-sections.f90
 create mode 100644 flang/test/Lower/OpenMP/parallel-wsloop-firstpriv.f90

diff --git a/flang/test/Lower/OpenMP/parallel-private-clause-fixes.f90 b/flang/test/Lower/OpenMP/parallel-private-clause-fixes.f90
new file mode 100644
index 000000000000000..9334b99f1e9d28d
--- /dev/null
+++ b/flang/test/Lower/OpenMP/parallel-private-clause-fixes.f90
@@ -0,0 +1,92 @@
+! This test checks a few bug fixes in the PRIVATE clause lowering
+
+! RUN: bbc -fopenmp -emit-hlfir %s -o - | FileCheck %s
+
+! CHECK-LABEL: multiple_private_fix
+! CHECK-SAME:  %[[GAMA:.*]]: !fir.ref<i32> {fir.bindc_name = "gama"}
+! CHECK:         %[[GAMA_DECL:.*]]:2 = hlfir.declare %[[GAMA]] {uniq_name = "_QFmultiple_private_fixEgama"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:         %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_private_fixEi"}
+! CHECK:         %[[I_DECL:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFmultiple_private_fixEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:         %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "j", uniq_name = "_QFmultiple_private_fixEj"}
+! CHECK:         %[[J_DECL:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFmultiple_private_fixEj"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:         %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFmultiple_private_fixEx"}
+! CHECK:         %[[X_DECL:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFmultiple_private_fixEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:         omp.parallel {
+! CHECK:           %[[PRIV_J:.*]] = fir.alloca i32 {bindc_name = "j", pinned
+! CHECK:           %[[PRIV_J_DECL:.*]]:2 = hlfir.declare %[[PRIV_J]] {uniq_name = "_QFmultiple_private_fixEj"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[PRIV_I:.*]] = fir.alloca i32 {adapt.valuebyref, pinned
+! CHECK:           %[[PRIV_I_DECL:.*]]:2 = hlfir.declare %[[PRIV_I]] {uniq_name = "_QFmultiple_private_fixEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[PRIV_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned
+! CHECK:           %[[PRIV_X_DECL:.*]]:2 = hlfir.declare %[[PRIV_X]] {uniq_name = "_QFmultiple_private_fixEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[ONE:.*]] = arith.constant 1 : i32
+! CHECK:           %[[VAL_3:.*]] = fir.load %[[GAMA_DECL]]#0 : !fir.ref<i32>
+! CHECK:           %[[VAL_5:.*]] = arith.constant 1 : i32
+! CHECK:           omp.wsloop for (%[[VAL_6:.*]]) : i32 = (%[[ONE]]) to (%[[VAL_3]]) inclusive step (%[[VAL_5]]) {
+! CHECK:             fir.store %[[VAL_6]] to %[[PRIV_I_DECL]]#1 : !fir.ref<i32>
+! CHECK:             %[[VAL_7:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (i32) -> index
+! CHECK:             %[[VAL_9:.*]] = fir.load %[[GAMA_DECL]]#0 : !fir.ref<i32>
+! CHECK:             %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i32) -> index
+! CHECK:             %[[VAL_11:.*]] = arith.constant 1 : index
+! CHECK:             %[[LB:.*]] = fir.convert %[[VAL_8]] : (index) -> i32
+! CHECK:             %[[VAL_12:.*]]:2 = fir.do_loop %[[VAL_13:[^ ]*]] =
+! CHECK-SAME:            %[[VAL_8]] to %[[VAL_10]] step %[[VAL_11]]
+! CHECK-SAME:            iter_args(%[[IV:.*]] = %[[LB]]) -> (index, i32) {
+! CHECK:               fir.store %[[IV]] to %[[PRIV_J_DECL]]#1 : !fir.ref<i32>
+! CHECK:               %[[LOAD:.*]] = fir.load %[[PRIV_I_DECL]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_15:.*]] = fir.load %[[PRIV_J_DECL]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_16:.*]] = arith.addi %[[LOAD]], %[[VAL_15]] : i32
+! CHECK:               hlfir.assign %[[VAL_16]] to %[[PRIV_X_DECL]]#0 : i32, !fir.ref<i32>
+! CHECK:               %[[VAL_17:.*]] = arith.addi %[[VAL_13]], %[[VAL_11]] : index
+! CHECK:               %[[STEPCAST:.*]] = fir.convert %[[VAL_11]] : (index) -> i32
+! CHECK:               %[[IVLOAD:.*]] = fir.load %[[PRIV_J_DECL]]#1 : !fir.ref<i32>
+! CHECK:               %[[IVINC:.*]] = arith.addi %[[IVLOAD]], %[[STEPCAST]]
+! CHECK:               fir.result %[[VAL_17]], %[[IVINC]] : index, i32
+! CHECK:             }
+! CHECK:             fir.store %[[VAL_12]]#1 to %[[PRIV_J_DECL]]#1 : !fir.ref<i32>
+! CHECK:             omp.yield
+! CHECK:           }
+! CHECK:           omp.terminator
+! CHECK:         }
+! CHECK:         return
+subroutine multiple_private_fix(gama)
+        integer :: i, j, x, gama
+!$OMP PARALLEL DO PRIVATE(j,x)
+        do i = 1, gama
+          do j = 1, gama
+            x = i + j
+          end do
+        end do
+!$OMP END PARALLEL DO
+end subroutine
+
+! CHECK-LABEL: multiple_private_fix2
+! CHECK:  %[[X1:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFmultiple_private_fix2Ex"}
+! CHECK:  %[[X1_DECL:.*]]:2 = hlfir.declare %[[X1]] {uniq_name = "_QFmultiple_private_fix2Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:  omp.parallel  {
+! CHECK:    %[[X2:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFmultiple_private_fix2Ex"}
+! CHECK:    %[[X2_DECL:.*]]:2 = hlfir.declare %[[X2]] {uniq_name = "_QFmultiple_private_fix2Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:    omp.parallel  {
+! CHECK:      %[[X3:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFmultiple_private_fix2Ex"}
+! CHECK:      %[[X3_DECL:.*]]:2 = hlfir.declare %[[X3]] {uniq_name = "_QFmultiple_private_fix2Ex"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:      %[[C3:.*]] = arith.constant 1 : i32
+! CHECK:      hlfir.assign %[[C3]] to %[[X3_DECL]]#0 : i32, !fir.ref<i32>
+! CHECK:      omp.terminator
+! CHECK:    }
+! CHECK:      %[[C2:.*]] = arith.constant 1 : i32
+! CHECK:      hlfir.assign %[[C2]] to %[[X2_DECL]]#0 : i32, !fir.ref<i32>
+! CHECK:    omp.terminator
+! CHECK:  }
+! CHECK:      %[[C1:.*]] = arith.constant 1 : i32
+! CHECK:      hlfir.assign %[[C1]] to %[[X1_DECL]]#0 : i32, !fir.ref<i32>
+! CHECK:  return
+subroutine multiple_private_fix2()
+   integer :: x
+   !$omp parallel private(x)
+   !$omp parallel private(x)
+      x = 1
+   !$omp end parallel
+      x = 1
+   !$omp end parallel
+      x = 1
+end subroutine
diff --git a/flang/test/Lower/OpenMP/parallel-reduction-add.f90 b/flang/test/Lower/OpenMP/parallel-reduction-add.f90
new file mode 100644
index 000000000000000..81a93aebbd26619
--- /dev/null
+++ b/flang/test/Lower/OpenMP/parallel-reduction-add.f90
@@ -0,0 +1,101 @@
+! RUN: bbc -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
+
+!CHECK-LABEL: omp.reduction.declare
+!CHECK-SAME: @[[RED_F32_NAME:.*]] : f32 init {
+!CHECK: ^bb0(%{{.*}}: f32):
+!CHECK:  %[[C0_1:.*]] = arith.constant 0.000000e+00 : f32
+!CHECK:  omp.yield(%[[C0_1]] : f32)
+!CHECK: } combiner {
+!CHECK: ^bb0(%[[ARG0:.*]]: f32, %[[ARG1:.*]]: f32):
+!CHECK:  %[[RES:.*]] = arith.addf %[[ARG0]], %[[ARG1]] {{.*}}: f32
+!CHECK:  omp.yield(%[[RES]] : f32)
+!CHECK: }
+
+!CHECK-LABEL: omp.reduction.declare
+!CHECK-SAME: @[[RED_I32_NAME:.*]] : i32 init {
+!CHECK: ^bb0(%{{.*}}: i32):
+!CHECK:  %[[C0_1:.*]] = arith.constant 0 : i32
+!CHECK:  omp.yield(%[[C0_1]] : i32)
+!CHECK: } combiner {
+!CHECK: ^bb0(%[[ARG0:.*]]: i32, %[[ARG1:.*]]: i32):
+!CHECK:  %[[RES:.*]] = arith.addi %[[ARG0]], %[[ARG1]] : i32
+!CHECK:  omp.yield(%[[RES]] : i32)
+!CHECK: }
+
+!CHECK-LABEL: func.func @_QPsimple_int_add
+!CHECK:  %[[IREF:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_int_addEi"}
+!CHECK:  %[[I_DECL:.*]]:2 = hlfir.declare %[[IREF]] {uniq_name = "_QFsimple_int_addEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:  %[[I_START:.*]] = arith.constant 0 : i32
+!CHECK:  hlfir.assign %[[I_START]] to %[[I_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK:  omp.parallel   reduction(@[[RED_I32_NAME]] -> %[[I_DECL]]#0 : !fir.ref<i32>) {
+!CHECK:    %[[I_INCR:.*]] = arith.constant 1 : i32
+!CHECK:    omp.reduction %[[I_INCR]], %[[I_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK:    omp.terminator
+!CHECK:  }
+!CHECK: return
+subroutine simple_int_add
+    integer :: i
+    i = 0
+
+    !$omp parallel reduction(+:i)
+    i = i + 1
+    !$omp end parallel
+
+    print *, i
+end subroutine
+
+!CHECK-LABEL: func.func @_QPsimple_real_add
+!CHECK:  %[[RREF:.*]] = fir.alloca f32 {bindc_name = "r", uniq_name = "_QFsimple_real_addEr"}
+!CHECK:  %[[R_DECL:.*]]:2 = hlfir.declare %[[RREF]] {uniq_name = "_QFsimple_real_addEr"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+!CHECK:  %[[R_START:.*]] = arith.constant 0.000000e+00 : f32
+!CHECK:  hlfir.assign %[[R_START]] to %[[R_DECL]]#0 : f32, !fir.ref<f32>
+!CHECK:  omp.parallel   reduction(@[[RED_F32_NAME]] -> %[[R_DECL]]#0 : !fir.ref<f32>) {
+!CHECK:    %[[R_INCR:.*]] = arith.constant 1.500000e+00 : f32
+!CHECK:    omp.reduction %[[R_INCR]], %[[R_DECL]]#0 : f32, !fir.ref<f32>
+!CHECK:    omp.terminator
+!CHECK:  }
+!CHECK: return
+subroutine simple_real_add
+    real :: r
+    r = 0.0
+
+    !$omp parallel reduction(+:r)
+    r = r + 1.5
+    !$omp end parallel
+
+    print *, r
+end subroutine
+
+!CHECK-LABEL: func.func @_QPint_real_add
+!CHECK:  %[[IREF:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFint_real_addEi"}
+!CHECK:  %[[I_DECL:.*]]:2 = hlfir.declare %[[IREF]] {uniq_name = "_QFint_real_addEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:  %[[RREF:.*]] = fir.alloca f32 {bindc_name = "r", uniq_name = "_QFint_real_addEr"}
+!CHECK:  %[[R_DECL:.*]]:2 = hlfir.declare %[[RREF]] {uniq_name = "_QFint_real_addEr"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+!CHECK:  %[[R_START:.*]] = arith.constant 0.000000e+00 : f32
+!CHECK:  hlfir.assign %[[R_START]] to %[[R_DECL]]#0 : f32, !fir.ref<f32>
+!CHECK:  %[[I_START:.*]] = arith.constant 0 : i32
+!CHECK:  hlfir.assign %[[I_START]] to %[[I_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK:  omp.parallel   reduction(@[[RED_I32_NAME]] -> %[[I_DECL]]#0 : !fir.ref<i32>, @[[RED_F32_NAME]] -> %[[R_DECL]]#0 : !fir.ref<f32>) {
+!CHECK:    %[[R_INCR:.*]] = arith.constant 1.500000e+00 : f32
+!CHECK:    omp.reduction %[[R_INCR]], %[[R_DECL]]#0 : f32, !fir.ref<f32>
+!CHECK:    %[[I_INCR:.*]] = arith.constant 3 : i32
+!CHECK:    omp.reduction %[[I_INCR]], %[[I_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK:    omp.terminator
+!CHECK:  }
+!CHECK: return
+subroutine int_real_add
+    real :: r
+    integer :: i
+
+    r = 0.0
+    i = 0
+
+    !$omp parallel reduction(+:i,r)
+    r = 1.5 + r
+    i = i + 3
+    !$omp end parallel
+
+    print *, r
+    print *, i
+end subroutine
diff --git a/flang/test/Lower/OpenMP/parallel-sections.f90 b/flang/test/Lower/OpenMP/parallel-sections.f90
new file mode 100644
index 000000000000000..77139e40ed8c0e3
--- /dev/null
+++ b/flang/test/Lower/OpenMP/parallel-sections.f90
@@ -0,0 +1,58 @@
+!RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s
+
+!===============================================================================
+! Parallel sections construct
+!===============================================================================
+
+!CHECK: func @_QPomp_parallel_sections
+subroutine omp_parallel_sections(x, y)
+  integer, intent(inout) :: x, y
+  !CHECK: omp.parallel {
+  !CHECK: omp.sections {
+  !$omp parallel sections
+    !CHECK: omp.section {
+    !$omp section
+      !CHECK: fir.load
+      !CHECK: arith.addi
+      !CHECK: hlfir.assign
+      x = x + 12
+      !CHECK: omp.terminator
+    !CHECK: omp.section {
+    !$omp section
+      !CHECK: fir.load
+      !CHECK: arith.subi
+      !CHECK: hlfir.assign
+      y = y - 5
+      !CHECK: omp.terminator
+  !CHECK: omp.terminator
+  !CHECK: omp.terminator
+  !$omp end parallel sections
+end subroutine omp_parallel_sections
+
+!===============================================================================
+! Parallel sections construct with allocate clause
+!===============================================================================
+
+!CHECK: func @_QPomp_parallel_sections
+subroutine omp_parallel_sections_allocate(x, y)
+  use omp_lib
+  integer, intent(inout) :: x, y
+  !CHECK: %[[allocator_1:.*]] = arith.constant 1 : i32
+  !CHECK: %[[allocator_2:.*]] = arith.constant 1 : i32
+  !CHECK: omp.parallel allocate(
+  !CHECK: %[[allocator_2]] : i32 -> %{{.*}} : !fir.ref<i32>) {
+  !CHECK: omp.sections allocate(
+  !CHECK: %[[allocator_1]] : i32 -> %{{.*}} : !fir.ref<i32>) {
+  !$omp parallel sections allocate(omp_high_bw_mem_alloc: x)
+    !CHECK: omp.section {
+    !$omp section
+      x = x + 12
+      !CHECK: omp.terminator
+    !CHECK: omp.section {
+    !$omp section
+      y = y + 5
+      !CHECK: omp.terminator
+  !CHECK: omp.terminator
+  !CHECK: omp.terminator
+  !$omp end parallel sections
+end subroutine omp_parallel_sections_allocate
diff --git a/flang/test/Lower/OpenMP/parallel-wsloop-firstpriv.f90 b/flang/test/Lower/OpenMP/parallel-wsloop-firstpriv.f90
new file mode 100644
index 000000000000000..4ab8f78556c3c28
--- /dev/null
+++ b/flang/test/Lower/OpenMP/parallel-wsloop-firstpriv.f90
@@ -0,0 +1,69 @@
+! This test checks lowering of OpenMP parallel DO, with the loop bound being
+! a firstprivate variable
+
+! RUN: bbc -fopenmp -emit-hlfir %s -o - | FileCheck %s
+
+! CHECK: func @_QPomp_do_firstprivate(%[[ARG0:.*]]: !fir.ref<i32> {fir.bindc_name = "a"}) 
+subroutine omp_do_firstprivate(a)
+  ! CHECK: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] {uniq_name = "_QFomp_do_firstprivateEa"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  integer::a
+  integer::n
+  n = a+1
+  !$omp parallel do firstprivate(a)
+  ! CHECK:  omp.parallel {
+  ! CHECK: %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+  ! CHECK: %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_REF]] {uniq_name = "_QFomp_do_firstprivateEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  ! CHECK: %[[A_PVT_REF:.*]] = fir.alloca i32 {bindc_name = "a", pinned, uniq_name = "_QFomp_do_firstprivateEa"}
+  ! CHECK: %[[A_PVT_DECL:.*]]:2 = hlfir.declare %[[A_PVT_REF]] {uniq_name = "_QFomp_do_firstprivateEa"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  ! CHECK-NEXT: %[[LD:.*]] = fir.load %[[ARG0_DECL]]#1 : !fir.ref<i32>
+  ! CHECK-NEXT: fir.store %[[LD]] to %[[A_PVT_DECL]]#1 : !fir.ref<i32>
+  ! CHECK: %[[LB:.*]] = arith.constant 1 : i32
+  ! CHECK-NEXT: %[[UB:.*]] = fir.load %[[A_PVT_DECL]]#0 : !fir.ref<i32>
+  ! CHECK-NEXT: %[[STEP:.*]] = arith.constant 1 : i32
+  ! CHECK-NEXT: omp.wsloop   for  (%[[ARG1:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]])
+  ! CHECK-NEXT: fir.store %[[ARG1]] to %[[I_PVT_DECL]]#1 : !fir.ref<i32>
+  ! CHECK-NEXT: fir.call @_QPfoo(%[[I_PVT_DECL]]#1, %[[A_PVT_DECL]]#1) {{.*}}: (!fir.ref<i32>, !fir.ref<i32>) -> ()
+  ! CHECK-NEXT: omp.yield
+    do i=1, a
+      call foo(i, a)
+    end do
+  !$omp end parallel do
+  !CHECK: fir.call @_QPbar(%[[ARG0_DECL]]#1) {{.*}}: (!fir.ref<i32>) -> ()
+  call bar(a)
+end subroutine omp_do_firstprivate
+
+! CHECK: func @_QPomp_do_firstprivate2(%[[ARG0:.*]]: !fir.ref<i32> {fir.bindc_name = "a"}, %[[ARG1:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) 
+subroutine omp_do_firstprivate2(a, n)
+  ! CHECK:  %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] {uniq_name = "_QFomp_do_firstprivate2Ea"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  ! CHECK:  %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] {uniq_name = "_QFomp_do_firstprivate2En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  integer::a
+  integer::n
+  n = a+1
+  !$omp parallel do firstprivate(a, n)
+  ! CHECK:  omp.parallel {
+  ! CHECK: %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+  ! CHECK: %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_REF]] {uniq_name = "_QFomp_do_firstprivate2Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  ! CHECK: %[[A_PVT_REF:.*]] = fir.alloca i32 {bindc_name = "a", pinned
+  ! CHECK: %[[A_PVT_DECL:.*]]:2 = hlfir.declare %[[A_PVT_REF]] {uniq_name = "_QFomp_do_firstprivate2Ea"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  ! CHECK: %[[LD:.*]] = fir.load %[[ARG0_DECL]]#1 : !fir.ref<i32>
+  ! CHECK: fir.store %[[LD]] to %[[A_PVT_DECL]]#1 : !fir.ref<i32>
+  ! CHECK: %[[N_PVT_REF:.*]] = fir.alloca i32 {bindc_name = "n", pinned, uniq_name = "_QFomp_do_firstprivate2En"}
+  ! CHECK: %[[N_PVT_DECL:.*]]:2 = hlfir.declare %[[N_PVT_REF]] {uniq_name = "_QFomp_do_firstprivate2En"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  ! CHECK: %[[LD1:.*]] = fir.load %[[ARG1_DECL]]#1 : !fir.ref<i32>
+  ! CHECK: fir.store %[[LD1]] to %[[N_PVT_DECL]]#1 : !fir.ref<i32>
+
+
+  ! CHECK: %[[LB:.*]] = fir.load %[[A_PVT_DECL]]#0 : !fir.ref<i32>
+  ! CHECK: %[[UB:.*]] = fir.load %[[N_PVT_DECL]]#0 : !fir.ref<i32>
+  ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32
+  ! CHECK: omp.wsloop   for  (%[[ARG2:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]])
+  ! CHECK: fir.store %[[ARG2]] to %[[I_PVT_DECL]]#1 : !fir.ref<i32>
+  ! CHECK: fir.call @_QPfoo(%[[I_PVT_DECL]]#1, %[[A_PVT_DECL]]#1) {{.*}}: (!fir.ref<i32>, !fir.ref<i32>) -> ()
+  ! CHECK: omp.yield
+    do i= a, n
+      call foo(i, a)
+    end do
+  !$omp end parallel do
+  !CHECK: fir.call @_QPbar(%[[ARG1_DECL]]#1) {{.*}}: (!fir.ref<i32>) -> ()
+  call bar(n)
+end subroutine omp_do_firstprivate2

>From dffb1b19f0748aa226eccb1813a4ada843785f01 Mon Sep 17 00:00:00 2001
From: Amirreza Ashouri <ar.ashouri999 at gmail.com>
Date: Wed, 11 Oct 2023 18:42:15 +0330
Subject: [PATCH 09/38] [clang] __is_trivially_equality_comparable for types
 containing lambdas (#68506)

Lambdas (closure types) are trivially equality-comparable iff they are
non-capturing, because non-capturing lambdas are convertible to function
pointers: if (lam1 == lam2) compiles, then lam1 and lam2 must have
the same type, and be always-equal, and be empty.
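
For illustration only (not part of this patch): a minimal standalone
example of the property described above. The names lam1/lam2/fp are
made up for the example.

  #include <cassert>

  int main() {
    auto lam1 = [] { return 42; };  // non-capturing closure
    auto lam2 = lam1;               // copy of the same empty closure type
    assert(lam1 == lam2);           // compiles via conversion to int (*)()
    int (*fp)() = lam1;             // only non-capturing lambdas convert
    assert(fp() == 42);
    // auto cap = [n = 1] { return n; };
    // cap == cap;                  // would not compile: capturing lambdas
    //                              // have no function-pointer conversion
    return 0;
  }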
---
 clang/include/clang/AST/DeclCXX.h     |  6 ++++++
 clang/lib/AST/DeclCXX.cpp             |  2 +-
 clang/lib/AST/Type.cpp                |  2 ++
 clang/lib/CodeGen/CodeGenFunction.cpp |  5 ++---
 clang/lib/Sema/SemaLambda.cpp         |  3 +--
 clang/test/SemaCXX/type-traits.cpp    | 24 +++++++++++++++++++++---
 6 files changed, 33 insertions(+), 9 deletions(-)

diff --git a/clang/include/clang/AST/DeclCXX.h b/clang/include/clang/AST/DeclCXX.h
index aa3e3322faa42e3..5eaae6bdd2bc63e 100644
--- a/clang/include/clang/AST/DeclCXX.h
+++ b/clang/include/clang/AST/DeclCXX.h
@@ -1052,6 +1052,12 @@ class CXXRecordDecl : public RecordDecl {
     return static_cast<LambdaCaptureDefault>(getLambdaData().CaptureDefault);
   }
 
+  bool isCapturelessLambda() const {
+    if (!isLambda())
+      return false;
+    return getLambdaCaptureDefault() == LCD_None && capture_size() == 0;
+  }
+
   /// Set the captures for this lambda closure type.
   void setCaptures(ASTContext &Context, ArrayRef<LambdaCapture> Captures);
 
diff --git a/clang/lib/AST/DeclCXX.cpp b/clang/lib/AST/DeclCXX.cpp
index a92b788366434ce..9107525a44f22c2 100644
--- a/clang/lib/AST/DeclCXX.cpp
+++ b/clang/lib/AST/DeclCXX.cpp
@@ -686,7 +686,7 @@ bool CXXRecordDecl::lambdaIsDefaultConstructibleAndAssignable() const {
   // C++17 [expr.prim.lambda]p21:
   //   The closure type associated with a lambda-expression has no default
   //   constructor and a deleted copy assignment operator.
-  if (getLambdaCaptureDefault() != LCD_None || capture_size() != 0)
+  if (!isCapturelessLambda())
     return false;
   return getASTContext().getLangOpts().CPlusPlus20;
 }
diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp
index 4c433f7fe9daca0..282298971705ba0 100644
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@@ -2663,6 +2663,8 @@ static bool
 HasNonDeletedDefaultedEqualityComparison(const CXXRecordDecl *Decl) {
   if (Decl->isUnion())
     return false;
+  if (Decl->isLambda())
+    return Decl->isCapturelessLambda();
 
   auto IsDefaultedOperatorEqualEqual = [&](const FunctionDecl *Function) {
     return Function->getOverloadedOperator() ==
diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp
index 9b21f428b0af7f5..42777194cc76dc0 100644
--- a/clang/lib/CodeGen/CodeGenFunction.cpp
+++ b/clang/lib/CodeGen/CodeGenFunction.cpp
@@ -1216,11 +1216,10 @@ void CodeGenFunction::StartFunction(GlobalDecl GD, QualType RetTy,
       SkippedChecks.set(SanitizerKind::ObjectSize, true);
       QualType ThisTy = MD->getThisType();
 
-      // If this is the call operator of a lambda with no capture-default, it
+      // If this is the call operator of a lambda with no captures, it
       // may have a static invoker function, which may call this operator with
       // a null 'this' pointer.
-      if (isLambdaCallOperator(MD) &&
-          MD->getParent()->getLambdaCaptureDefault() == LCD_None)
+      if (isLambdaCallOperator(MD) && MD->getParent()->isCapturelessLambda())
         SkippedChecks.set(SanitizerKind::Null, true);
 
       EmitTypeCheck(
diff --git a/clang/lib/Sema/SemaLambda.cpp b/clang/lib/Sema/SemaLambda.cpp
index 421048aaff5c90c..ca09b0481bcac76 100644
--- a/clang/lib/Sema/SemaLambda.cpp
+++ b/clang/lib/Sema/SemaLambda.cpp
@@ -393,8 +393,7 @@ void Sema::DiagnoseInvalidExplicitObjectParameterInLambda(
   CXXRecordDecl *RD = Method->getParent();
   if (Method->getType()->isDependentType())
     return;
-  if (RD->getLambdaCaptureDefault() == LambdaCaptureDefault::LCD_None &&
-      RD->capture_size() == 0)
+  if (RD->isCapturelessLambda())
     return;
   QualType ExplicitObjectParameterType = Method->getParamDecl(0)
                                              ->getType()
diff --git a/clang/test/SemaCXX/type-traits.cpp b/clang/test/SemaCXX/type-traits.cpp
index a35689d52978fcc..275ddcbae73930d 100644
--- a/clang/test/SemaCXX/type-traits.cpp
+++ b/clang/test/SemaCXX/type-traits.cpp
@@ -3160,11 +3160,18 @@ static_assert(!__is_trivially_equality_comparable(float), "");
 static_assert(!__is_trivially_equality_comparable(double), "");
 static_assert(!__is_trivially_equality_comparable(long double), "");
 
-struct TriviallyEqualityComparableNoDefaultedComparator {
+struct NonTriviallyEqualityComparableNoComparator {
   int i;
   int j;
 };
-static_assert(!__is_trivially_equality_comparable(TriviallyEqualityComparableNoDefaultedComparator), "");
+static_assert(!__is_trivially_equality_comparable(NonTriviallyEqualityComparableNoComparator), "");
+
+struct NonTriviallyEqualityComparableNonDefaultedComparator {
+  int i;
+  int j;
+  bool operator==(const NonTriviallyEqualityComparableNonDefaultedComparator&);
+};
+static_assert(!__is_trivially_equality_comparable(NonTriviallyEqualityComparableNonDefaultedComparator), "");
 
 #if __cplusplus >= 202002L
 
@@ -3177,7 +3184,7 @@ struct TriviallyEqualityComparable {
 
   bool operator==(const TriviallyEqualityComparable&) const = default;
 };
-static_assert(__is_trivially_equality_comparable(TriviallyEqualityComparable), "");
+static_assert(__is_trivially_equality_comparable(TriviallyEqualityComparable));
 
 struct TriviallyEqualityComparableContainsArray {
   int a[4];
@@ -3193,6 +3200,17 @@ struct TriviallyEqualityComparableContainsMultiDimensionArray {
 };
 static_assert(__is_trivially_equality_comparable(TriviallyEqualityComparableContainsMultiDimensionArray));
 
+auto GetNonCapturingLambda() { return [](){ return 42; }; }
+
+struct TriviallyEqualityComparableContainsLambda {
+  [[no_unique_address]] decltype(GetNonCapturingLambda()) l;
+  int i;
+
+  bool operator==(const TriviallyEqualityComparableContainsLambda&) const = default;
+};
+static_assert(!__is_trivially_equality_comparable(decltype(GetNonCapturingLambda()))); // padding
+static_assert(__is_trivially_equality_comparable(TriviallyEqualityComparableContainsLambda));
+
 struct TriviallyEqualityComparableNonTriviallyCopyable {
   TriviallyEqualityComparableNonTriviallyCopyable(const TriviallyEqualityComparableNonTriviallyCopyable&);
   ~TriviallyEqualityComparableNonTriviallyCopyable();

>From e880ef8e59b596cd0f879c56b165f5894af5b228 Mon Sep 17 00:00:00 2001
From: Christian Ulmann <christian.ulmann at nextsilicon.com>
Date: Wed, 11 Oct 2023 15:13:18 +0000
Subject: [PATCH 10/38] Revert "Reland: [MLIR][Transforms] Fix Mem2Reg removal
 order to respect dominance (#68767)"

This reverts commit 59fec735950bd9336ae2fd7560b300cb857033f2.

This reland did not properly fix the partial order.
---
 mlir/lib/Transforms/Mem2Reg.cpp       | 33 ++++++++++-----------------
 mlir/test/Dialect/LLVMIR/mem2reg.mlir | 13 -----------
 2 files changed, 12 insertions(+), 34 deletions(-)

diff --git a/mlir/lib/Transforms/Mem2Reg.cpp b/mlir/lib/Transforms/Mem2Reg.cpp
index 3132d5b2f82a6a0..65de25dd2f32663 100644
--- a/mlir/lib/Transforms/Mem2Reg.cpp
+++ b/mlir/lib/Transforms/Mem2Reg.cpp
@@ -96,9 +96,6 @@ using namespace mlir;
 
 namespace {
 
-using BlockingUsesMap =
-    llvm::MapVector<Operation *, SmallPtrSet<OpOperand *, 4>>;
-
 /// Information computed during promotion analysis used to perform actual
 /// promotion.
 struct MemorySlotPromotionInfo {
@@ -109,7 +106,7 @@ struct MemorySlotPromotionInfo {
   /// its uses, it is because the defining ops of the blocking uses requested
   /// it. The defining ops therefore must also have blocking uses or be the
   /// starting point of the bloccking uses.
-  BlockingUsesMap userToBlockingUses;
+  DenseMap<Operation *, SmallPtrSet<OpOperand *, 4>> userToBlockingUses;
 };
 
 /// Computes information for basic slot promotion. This will check that direct
@@ -132,7 +129,8 @@ class MemorySlotPromotionAnalyzer {
   /// uses (typically, removing its users because it will delete itself to
   /// resolve its own blocking uses). This will fail if one of the transitive
   /// users cannot remove a requested use, and should prevent promotion.
-  LogicalResult computeBlockingUses(BlockingUsesMap &userToBlockingUses);
+  LogicalResult computeBlockingUses(
+      DenseMap<Operation *, SmallPtrSet<OpOperand *, 4>> &userToBlockingUses);
 
   /// Computes in which blocks the value stored in the slot is actually used,
   /// meaning blocks leading to a load. This method uses `definingBlocks`, the
@@ -235,7 +233,7 @@ Value MemorySlotPromoter::getLazyDefaultValue() {
 }
 
 LogicalResult MemorySlotPromotionAnalyzer::computeBlockingUses(
-    BlockingUsesMap &userToBlockingUses) {
+    DenseMap<Operation *, SmallPtrSet<OpOperand *, 4>> &userToBlockingUses) {
   // The promotion of an operation may require the promotion of further
   // operations (typically, removing operations that use an operation that must
   // delete itself). We thus need to start from the use of the slot pointer and
@@ -245,7 +243,7 @@ LogicalResult MemorySlotPromotionAnalyzer::computeBlockingUses(
   // use it.
   for (OpOperand &use : slot.ptr.getUses()) {
     SmallPtrSet<OpOperand *, 4> &blockingUses =
-        userToBlockingUses[use.getOwner()];
+        userToBlockingUses.getOrInsertDefault(use.getOwner());
     blockingUses.insert(&use);
   }
 
@@ -283,7 +281,7 @@ LogicalResult MemorySlotPromotionAnalyzer::computeBlockingUses(
       assert(llvm::is_contained(user->getResults(), blockingUse->get()));
 
       SmallPtrSetImpl<OpOperand *> &newUserBlockingUseSet =
-          userToBlockingUses[blockingUse->getOwner()];
+          userToBlockingUses.getOrInsertDefault(blockingUse->getOwner());
       newUserBlockingUseSet.insert(blockingUse);
     }
   }
@@ -518,21 +516,14 @@ void MemorySlotPromoter::computeReachingDefInRegion(Region *region,
 }
 
 void MemorySlotPromoter::removeBlockingUses() {
-  llvm::SmallVector<Operation *> usersToRemoveUses(
-      llvm::make_first_range(info.userToBlockingUses));
-
-  // The uses need to be traversed in *reverse dominance* order to ensure that
-  // transitive replacements are performed correctly.
-  // NOTE: The order can be non-deterministic, due to a pointer comparision, but
-  // this has no effect on the result of the pattern. This is necessary to get a
-  // strict weak order relation.
-  llvm::sort(usersToRemoveUses, [&](Operation *lhs, Operation *rhs) {
-    return dominance.properlyDominates(rhs, lhs) ||
-           (!dominance.properlyDominates(lhs, rhs) && rhs < lhs);
-  });
+  llvm::SetVector<Operation *> usersToRemoveUses;
+  for (auto &user : llvm::make_first_range(info.userToBlockingUses))
+    usersToRemoveUses.insert(user);
+  SetVector<Operation *> sortedUsersToRemoveUses =
+      mlir::topologicalSort(usersToRemoveUses);
 
   llvm::SmallVector<Operation *> toErase;
-  for (Operation *toPromote : usersToRemoveUses) {
+  for (Operation *toPromote : llvm::reverse(sortedUsersToRemoveUses)) {
     if (auto toPromoteMemOp = dyn_cast<PromotableMemOpInterface>(toPromote)) {
       Value reachingDef = reachingDefs.lookup(toPromoteMemOp);
       // If no reaching definition is known, this use is outside the reach of
diff --git a/mlir/test/Dialect/LLVMIR/mem2reg.mlir b/mlir/test/Dialect/LLVMIR/mem2reg.mlir
index 32e3fed7e5485df..30ba459d07a49f3 100644
--- a/mlir/test/Dialect/LLVMIR/mem2reg.mlir
+++ b/mlir/test/Dialect/LLVMIR/mem2reg.mlir
@@ -683,16 +683,3 @@ llvm.func @no_inner_alloca_promotion(%arg: i64) -> i64 {
   // CHECK: llvm.return %[[RES]] : i64
   llvm.return %2 : i64
 }
-
-// -----
-
-// CHECK-LABEL: @transitive_reaching_def
-llvm.func @transitive_reaching_def() -> !llvm.ptr {
-  %0 = llvm.mlir.constant(1 : i32) : i32
-  // CHECK-NOT: alloca
-  %1 = llvm.alloca %0 x !llvm.ptr {alignment = 8 : i64} : (i32) -> !llvm.ptr
-  %2 = llvm.load %1 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr
-  llvm.store %2, %1 {alignment = 8 : i64} : !llvm.ptr, !llvm.ptr
-  %3 = llvm.load %1 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr
-  llvm.return %3 : !llvm.ptr
-}

>From b930f78d6dde90b87426107ddba90466a012a252 Mon Sep 17 00:00:00 2001
From: Guray Ozen <guray.ozen at gmail.com>
Date: Wed, 11 Oct 2023 08:28:26 -0700
Subject: [PATCH 11/38] [MLIR][NVGPU] Introduce `warpgroup.init.accumulator` Op
 (#67530)

This Op generates and initializes the accumulator matrix for the
`nvgpu.warpgroup.mma` op to perform matrix-multiply-and-accumulate
(mma).

Its associated transformation generates an `!llvm.struct<>` and fills
it with the initial values. The size of the struct is the number of
inout registers required by the `nvgpu.warpgroup.mma` op.
---
 mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td   | 12 ++++
 .../Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp    | 30 ++++++++
 mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp    | 26 ++++++-
 .../Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir | 71 +++++++++++++++++++
 4 files changed, 138 insertions(+), 1 deletion(-)

diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
index 57cd1a3806c2ed6..79183acfb71b61e 100644
--- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
@@ -748,4 +748,16 @@ def NVGPU_WarpgroupMmaStoreOp : NVGPU_Op<"warpgroup.mma.store"> {
   let hasVerifier = 1;
 }
 
+def NVGPU_WarpgroupMmaInitAccumulatorOp : NVGPU_Op<"warpgroup.mma.init.accumulator"> {  
+  let summary = "Initializes the accumulator matrix";
+
+  let description = [{
+    This Op generates and initializes the accumulator matrix for 
+    `nvgpu.warpgroup.mma` op to perform matrix-multiply-and-accumulate.
+  }];
+  let results = (outs Variadic<NVGPU_WarpgroupAccumulator>:$matrixC);
+  let assemblyFormat = "attr-dict `->` type($matrixC)";
+  let hasVerifier = 1;
+}
+
 #endif // NVGPU
diff --git a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
index 99c4d4223351352..84f53a4572294ad 100644
--- a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
+++ b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp
@@ -1546,6 +1546,35 @@ struct NVGPUWarpgroupMmaStoreOpLowering
   }
 };
 
+struct NVGPUWarpgroupMmaInitAccumulatorOpLowering
+    : public ConvertOpToLLVMPattern<nvgpu::WarpgroupMmaInitAccumulatorOp> {
+  using ConvertOpToLLVMPattern<
+      nvgpu::WarpgroupMmaInitAccumulatorOp>::ConvertOpToLLVMPattern;
+  LogicalResult
+  matchAndRewrite(nvgpu::WarpgroupMmaInitAccumulatorOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    ImplicitLocOpBuilder b(op->getLoc(), rewriter);
+    SmallVector<Value> results;
+    for (OpResult m : op.getMatrixC()) {
+      nvgpu::WarpgroupAccumulatorType mType =
+          m.getType().cast<nvgpu::WarpgroupAccumulatorType>();
+      Type stype = getTypeConverter()->convertType(mType);
+      Value undefStruct = b.create<LLVM::UndefOp>(stype);
+      Type elemType = mType.getFragmented().getElementType();
+      int64_t elemSize = mType.getFragmented().getDimSize(0);
+      Value zero =
+          b.create<LLVM::ConstantOp>(elemType, rewriter.getZeroAttr(elemType));
+      for (int64_t i = 0; i < elemSize; ++i) {
+        undefStruct = b.create<LLVM::InsertValueOp>(stype, undefStruct, zero,
+                                                    ArrayRef<int64_t>({i}));
+      }
+      results.push_back(undefStruct);
+    }
+    rewriter.replaceOp(op, results);
+    return success();
+  }
+};
+
 } // namespace
 
 void mlir::populateNVGPUToNVVMConversionPatterns(LLVMTypeConverter &converter,
@@ -1563,6 +1592,7 @@ void mlir::populateNVGPUToNVVMConversionPatterns(LLVMTypeConverter &converter,
       NVGPUGenerateWarpgroupDescriptorLowering, // nvgpu.warpgroup.generate.descriptor
       NVGPUWarpgroupMmaOpLowering,              // nvgpu.warpgroup.mma
       NVGPUWarpgroupMmaStoreOpLowering,         // nvgpu.warpgroup.mma.store
+      NVGPUWarpgroupMmaInitAccumulatorOpLowering, // nvgpu.warpgroup.mma.init.accumulator
       MmaSyncOptoNVVM, MmaLdMatrixOpToNVVM, NVGPUAsyncCopyLowering,
       NVGPUAsyncCreateGroupLowering, NVGPUAsyncWaitLowering,
       NVGPUMmaSparseSyncLowering>(converter);
diff --git a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
index e8ecd0faa4c86d3..fe71eae899cd63d 100644
--- a/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
+++ b/mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp
@@ -435,6 +435,8 @@ LogicalResult isAllowedWGMMADataType(Type typeD, Type typeA, Type typeB) {
   return failure();
 }
 
+LogicalResult isAllowedSizeM(int sizeM) { return success(sizeM == 64); }
+
 LogicalResult isAllowedSizeN(int sizeN, Type typeA) {
   SmallVector<int> allowedN = {8,   16,  24,  32,  40,  48,  56,  64,
                                72,  80,  88,  96,  104, 112, 120, 128,
@@ -443,7 +445,7 @@ LogicalResult isAllowedSizeN(int sizeN, Type typeA) {
   SmallVector<int> allowedNshort = {8,   16,  24,  32,  48,  64,
                                     80,  96,  112, 128, 144, 160,
                                     176, 192, 208, 224, 240, 256};
-  if (typeA.isBF16() || typeA.isF16() || typeA.isTF32() ||
+  if (typeA.isBF16() || typeA.isF16() || typeA.isF32() || typeA.isTF32() ||
       typeA.isFloat8E4M3FN() || typeA.isFloat8E5M2())
     if (llvm::is_contained(allowedN, sizeN))
       return success();
@@ -563,6 +565,28 @@ LogicalResult WarpgroupMmaStoreOp::verify() {
   return success();
 }
 
+//===----------------------------------------------------------------------===//
+// WarpgroupMmaInitAccumulatorOp
+//===----------------------------------------------------------------------===//
+
+LogicalResult WarpgroupMmaInitAccumulatorOp::verify() {
+  for (OpResult matrix : getMatrixC()) {
+    VectorType vectorType = matrix.getType()
+                                .cast<nvgpu::WarpgroupAccumulatorType>()
+                                .getFragmented();
+    // Check [M][N] shape
+    if (failed(isAllowedSizeM(vectorType.getDimSize(0))) ||
+        failed(isAllowedSizeN(vectorType.getDimSize(1),
+                              vectorType.getElementType()))) {
+      return emitOpError() << "has type " << vectorType
+                           << ". It does not fit into warp-group "
+                              "level (wgmma) matrix multiplication instruction "
+                              "(or not supported yet)";
+    }
+  }
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // TableGen'd dialect, type, and op definitions
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
index e54b62a06d4313a..ca030575e5e961e 100644
--- a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir
@@ -901,6 +901,77 @@ func.func @warpgroup_mma_store(
   return 
 }
 
+func.func @warpgroup_mma_init() {
+  //CHECK: %[[S0:.+]] = llvm.mlir.undef : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>
+  //CHECK: %[[S1:.+]] = llvm.mlir.constant(0.000000e+00 : f32) : f3
+  //CHECK: %[[S2:.+]] = llvm.insertvalue %[[S1]], %[[S0]][0] : !llvm.struct
+  //CHECK: %[[S3:.+]] = llvm.insertvalue %[[S1]], %[[S2]][1] : !llvm.struct
+  //CHECK: %[[S4:.+]] = llvm.insertvalue %[[S1]], %[[S3]][2] : !llvm.struct
+  //CHECK: %[[S5:.+]] = llvm.insertvalue %[[S1]], %[[S4]][3] : !llvm.struct
+  //CHECK: %[[S6:.+]] = llvm.insertvalue %[[S1]], %[[S5]][4] : !llvm.struct
+  //CHECK: %[[S7:.+]] = llvm.insertvalue %[[S1]], %[[S6]][5] : !llvm.struct
+  //CHECK: %[[S8:.+]] = llvm.insertvalue %[[S1]], %[[S7]][6] : !llvm.struct
+  //CHECK: %[[S9:.+]] = llvm.insertvalue %[[S1]], %[[S8]][7] : !llvm.struct
+  //CHECK: %[[S10:.+]] = llvm.insertvalue %[[S1]], %[[S9]][8] : !llvm.struct
+  //CHECK: %[[S11:.+]] = llvm.insertvalue %[[S1]], %[[S10]][9] : !llvm.struct
+  //CHECK: %[[S12:.+]] = llvm.insertvalue %[[S1]], %[[S11]][10] : !llvm.struct
+  //CHECK: %[[S13:.+]] = llvm.insertvalue %[[S1]], %[[S12]][11] : !llvm.struct
+  //CHECK: %[[S14:.+]] = llvm.insertvalue %[[S1]], %[[S13]][12] : !llvm.struct
+  //CHECK: %[[S15:.+]] = llvm.insertvalue %[[S1]], %[[S14]][13] : !llvm.struct
+  //CHECK: %[[S16:.+]] = llvm.insertvalue %[[S1]], %[[S15]][14] : !llvm.struct
+  //CHECK: %[[S17:.+]] = llvm.insertvalue %[[S1]], %[[S16]][15] : !llvm.struct
+  //CHECK: %[[S18:.+]] = llvm.insertvalue %[[S1]], %[[S17]][16] : !llvm.struct
+  //CHECK: %[[S19:.+]] = llvm.insertvalue %[[S1]], %[[S18]][17] : !llvm.struct
+  //CHECK: %[[S20:.+]] = llvm.insertvalue %[[S1]], %[[S19]][18] : !llvm.struct
+  //CHECK: %[[S21:.+]] = llvm.insertvalue %[[S1]], %[[S20]][19] : !llvm.struct
+  //CHECK: %[[S22:.+]] = llvm.insertvalue %[[S1]], %[[S21]][20] : !llvm.struct
+  //CHECK: %[[S23:.+]] = llvm.insertvalue %[[S1]], %[[S22]][21] : !llvm.struct
+  //CHECK: %[[S24:.+]] = llvm.insertvalue %[[S1]], %[[S23]][22] : !llvm.struct
+  //CHECK: %[[S25:.+]] = llvm.insertvalue %[[S1]], %[[S24]][23] : !llvm.struct
+  //CHECK: %[[S26:.+]] = llvm.insertvalue %[[S1]], %[[S25]][24] : !llvm.struct
+  //CHECK: %[[S27:.+]] = llvm.insertvalue %[[S1]], %[[S26]][25] : !llvm.struct
+  //CHECK: %[[S28:.+]] = llvm.insertvalue %[[S1]], %[[S27]][26] : !llvm.struct
+  //CHECK: %[[S29:.+]] = llvm.insertvalue %[[S1]], %[[S28]][27] : !llvm.struct
+  //CHECK: %[[S30:.+]] = llvm.insertvalue %[[S1]], %[[S29]][28] : !llvm.struct
+  //CHECK: %[[S31:.+]] = llvm.insertvalue %[[S1]], %[[S30]][29] : !llvm.struct
+  //CHECK: %[[S32:.+]] = llvm.insertvalue %[[S1]], %[[S31]][30] : !llvm.struct
+  //CHECK: %[[S33:.+]] = llvm.insertvalue %[[S1]], %[[S32]][31] : !llvm.struct
+  //CHECK: %[[S34:.+]] = llvm.insertvalue %[[S1]], %[[S33]][32] : !llvm.struct
+  //CHECK: %[[S35:.+]] = llvm.insertvalue %[[S1]], %[[S34]][33] : !llvm.struct
+  //CHECK: %[[S36:.+]] = llvm.insertvalue %[[S1]], %[[S35]][34] : !llvm.struct
+  //CHECK: %[[S37:.+]] = llvm.insertvalue %[[S1]], %[[S36]][35] : !llvm.struct
+  //CHECK: %[[S38:.+]] = llvm.insertvalue %[[S1]], %[[S37]][36] : !llvm.struct
+  //CHECK: %[[S39:.+]] = llvm.insertvalue %[[S1]], %[[S38]][37] : !llvm.struct
+  //CHECK: %[[S40:.+]] = llvm.insertvalue %[[S1]], %[[S39]][38] : !llvm.struct
+  //CHECK: %[[S41:.+]] = llvm.insertvalue %[[S1]], %[[S40]][39] : !llvm.struct
+  //CHECK: %[[S42:.+]] = llvm.insertvalue %[[S1]], %[[S41]][40] : !llvm.struct
+  //CHECK: %[[S43:.+]] = llvm.insertvalue %[[S1]], %[[S42]][41] : !llvm.struct
+  //CHECK: %[[S44:.+]] = llvm.insertvalue %[[S1]], %[[S43]][42] : !llvm.struct
+  //CHECK: %[[S45:.+]] = llvm.insertvalue %[[S1]], %[[S44]][43] : !llvm.struct
+  //CHECK: %[[S46:.+]] = llvm.insertvalue %[[S1]], %[[S45]][44] : !llvm.struct
+  //CHECK: %[[S47:.+]] = llvm.insertvalue %[[S1]], %[[S46]][45] : !llvm.struct
+  //CHECK: %[[S48:.+]] = llvm.insertvalue %[[S1]], %[[S47]][46] : !llvm.struct
+  //CHECK: %[[S49:.+]] = llvm.insertvalue %[[S1]], %[[S48]][47] : !llvm.struct
+  //CHECK: %[[S50:.+]] = llvm.insertvalue %[[S1]], %[[S49]][48] : !llvm.struct
+  //CHECK: %[[S51:.+]] = llvm.insertvalue %[[S1]], %[[S50]][49] : !llvm.struct
+  //CHECK: %[[S52:.+]] = llvm.insertvalue %[[S1]], %[[S51]][50] : !llvm.struct
+  //CHECK: %[[S53:.+]] = llvm.insertvalue %[[S1]], %[[S52]][51] : !llvm.struct
+  //CHECK: %[[S54:.+]] = llvm.insertvalue %[[S1]], %[[S53]][52] : !llvm.struct
+  //CHECK: %[[S55:.+]] = llvm.insertvalue %[[S1]], %[[S54]][53] : !llvm.struct
+  //CHECK: %[[S56:.+]] = llvm.insertvalue %[[S1]], %[[S55]][54] : !llvm.struct
+  //CHECK: %[[S57:.+]] = llvm.insertvalue %[[S1]], %[[S56]][55] : !llvm.struct
+  //CHECK: %[[S58:.+]] = llvm.insertvalue %[[S1]], %[[S57]][56] : !llvm.struct
+  //CHECK: %[[S59:.+]] = llvm.insertvalue %[[S1]], %[[S58]][57] : !llvm.struct
+  //CHECK: %[[S60:.+]] = llvm.insertvalue %[[S1]], %[[S59]][58] : !llvm.struct
+  //CHECK: %[[S61:.+]] = llvm.insertvalue %[[S1]], %[[S60]][59] : !llvm.struct
+  //CHECK: %[[S62:.+]] = llvm.insertvalue %[[S1]], %[[S61]][60] : !llvm.struct
+  //CHECK: %[[S63:.+]] = llvm.insertvalue %[[S1]], %[[S62]][61] : !llvm.struct
+  //CHECK: %[[S64:.+]] = llvm.insertvalue %[[S1]], %[[S63]][62] : !llvm.struct
+  //CHECK: %[[S65:.+]] = llvm.insertvalue %[[S1]], %[[S64]][63] : !llvm.struct
+  %matrixC = nvgpu.warpgroup.mma.init.accumulator -> !nvgpu.warpgroup.accumulator< fragmented = vector<64x128xf32>>
+  return 
+}
+
 transform.sequence failures(propagate) {
 ^bb1(%arg1: !transform.any_op):
   %0 = transform.structured.match ops{["func.func"]} in %arg1 

>From 71be91312b7e65795bfd29b8b2b94f1a8794f9fe Mon Sep 17 00:00:00 2001
From: Shraiysh <Shraiysh.Vaishay at amd.com>
Date: Wed, 11 Oct 2023 11:36:03 -0400
Subject: [PATCH 12/38] [OpenMPIRBuilder] Add ThreadLimit and NumTeams clauses
 to teams construct (#68364)

This patch adds support for the `thread_limit` clause and for lower and
upper bounds on the `num_teams` clause of the teams construct in OpenMP.

Test cases are added for the new functionality.
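
Sketch only (not part of the patch): how a caller might pass the new
arguments, assuming an initialized OpenMPIRBuilder `OMPBuilder`, an
`IRBuilder<> &Builder`, and a body-generation callback `BodyGenCB` as
in the unit tests below.

  // num_teams(8) thread_limit(128); the lower bound is left null, so
  // createTeams defaults it to the upper bound.
  llvm::Value *NumTeamsUpper = Builder.getInt32(8);
  llvm::Value *ThreadLimit = Builder.getInt32(128);
  Builder.restoreIP(OMPBuilder.createTeams(Builder, BodyGenCB,
                                           /*NumTeamsLower=*/nullptr,
                                           NumTeamsUpper, ThreadLimit));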
---
 .../llvm/Frontend/OpenMP/OMPIRBuilder.h       |  11 +-
 .../include/llvm/Frontend/OpenMP/OMPKinds.def |   1 +
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     |  23 +-
 .../Frontend/OpenMPIRBuilderTest.cpp          | 210 ++++++++++++++++++
 4 files changed, 243 insertions(+), 2 deletions(-)

diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index b505daae7e75f80..9d2adf229b78654 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -1917,8 +1917,17 @@ class OpenMPIRBuilder {
   ///
   /// \param Loc The location where the teams construct was encountered.
   /// \param BodyGenCB Callback that will generate the region code.
+  /// \param NumTeamsLower Lower bound on number of teams. If this is nullptr,
+  ///        it is as if lower bound is specified as equal to upperbound. If
+  ///        this is non-null, then upperbound must also be non-null.
+  /// \param NumTeamsUpper Upper bound on the number of teams.
+  /// \param ThreadLimit on the number of threads that may participate in a
+  ///        contention group created by each team.
   InsertPointTy createTeams(const LocationDescription &Loc,
-                            BodyGenCallbackTy BodyGenCB);
+                            BodyGenCallbackTy BodyGenCB,
+                            Value *NumTeamsLower = nullptr,
+                            Value *NumTeamsUpper = nullptr,
+                            Value *ThreadLimit = nullptr);
 
   /// Generate conditional branch and relevant BasicBlocks through which private
   /// threads copy the 'copyin' variables from Master copy to threadprivate
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index 176b883fe68f7ad..4823c4cc6b833ec 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -387,6 +387,7 @@ __OMP_RTL(__kmpc_cancellationpoint, false, Int32, IdentPtr, Int32, Int32)
 
 __OMP_RTL(__kmpc_fork_teams, true, Void, IdentPtr, Int32, ParallelTaskPtr)
 __OMP_RTL(__kmpc_push_num_teams, false, Void, IdentPtr, Int32, Int32, Int32)
+__OMP_RTL(__kmpc_push_num_teams_51, false, Void, IdentPtr, Int32, Int32, Int32, Int32)
 __OMP_RTL(__kmpc_set_thread_limit, false, Void, IdentPtr, Int32, Int32)
 
 __OMP_RTL(__kmpc_copyprivate, false, Void, IdentPtr, Int32, SizeTy, VoidPtr,
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 8e240ba96050082..a658990f2d45355 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -5733,7 +5733,8 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
 
 OpenMPIRBuilder::InsertPointTy
 OpenMPIRBuilder::createTeams(const LocationDescription &Loc,
-                             BodyGenCallbackTy BodyGenCB) {
+                             BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower,
+                             Value *NumTeamsUpper, Value *ThreadLimit) {
   if (!updateToLocation(Loc))
     return InsertPointTy();
 
@@ -5771,6 +5772,26 @@ OpenMPIRBuilder::createTeams(const LocationDescription &Loc,
   BasicBlock *AllocaBB =
       splitBB(Builder, /*CreateBranch=*/true, "teams.alloca");
 
+  // Push num_teams
+  if (NumTeamsLower || NumTeamsUpper || ThreadLimit) {
+    assert((NumTeamsLower == nullptr || NumTeamsUpper != nullptr) &&
+           "if lowerbound is non-null, then upperbound must also be non-null "
+           "for bounds on num_teams");
+
+    if (NumTeamsUpper == nullptr)
+      NumTeamsUpper = Builder.getInt32(0);
+
+    if (NumTeamsLower == nullptr)
+      NumTeamsLower = NumTeamsUpper;
+
+    if (ThreadLimit == nullptr)
+      ThreadLimit = Builder.getInt32(0);
+
+    Value *ThreadNum = getOrCreateThreadID(Ident);
+    Builder.CreateCall(
+        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_teams_51),
+        {Ident, ThreadNum, NumTeamsLower, NumTeamsUpper, ThreadLimit});
+  }
   // Generate the body of teams.
   InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
   InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
index c60aca259d1859f..d770facc1730252 100644
--- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
@@ -4074,6 +4074,216 @@ TEST_F(OpenMPIRBuilderTest, CreateTeams) {
                      [](Instruction &inst) { return isa<ICmpInst>(&inst); }));
 }
 
+TEST_F(OpenMPIRBuilderTest, CreateTeamsWithThreadLimit) {
+  using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
+  OpenMPIRBuilder OMPBuilder(*M);
+  OMPBuilder.initialize();
+  F->setName("func");
+  IRBuilder<> &Builder = OMPBuilder.Builder;
+  Builder.SetInsertPoint(BB);
+
+  Function *FakeFunction =
+      Function::Create(FunctionType::get(Builder.getVoidTy(), false),
+                       GlobalValue::ExternalLinkage, "fakeFunction", M.get());
+
+  auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
+    Builder.restoreIP(CodeGenIP);
+    Builder.CreateCall(FakeFunction, {});
+  };
+
+  // `F` has an argument - an integer, so we use that as the thread limit.
+  Builder.restoreIP(OMPBuilder.createTeams(/*=*/Builder, BodyGenCB,
+                                           /*NumTeamsLower=*/nullptr,
+                                           /*NumTeamsUpper=*/nullptr,
+                                           /*ThreadLimit=*/F->arg_begin()));
+
+  Builder.CreateRetVoid();
+  OMPBuilder.finalize();
+
+  ASSERT_FALSE(verifyModule(*M));
+
+  CallInst *PushNumTeamsCallInst =
+      findSingleCall(F, OMPRTL___kmpc_push_num_teams_51, OMPBuilder);
+  ASSERT_NE(PushNumTeamsCallInst, nullptr);
+
+  EXPECT_EQ(PushNumTeamsCallInst->getArgOperand(2), Builder.getInt32(0));
+  EXPECT_EQ(PushNumTeamsCallInst->getArgOperand(3), Builder.getInt32(0));
+  EXPECT_EQ(PushNumTeamsCallInst->getArgOperand(4), &*F->arg_begin());
+
+  // Verifying that the next instruction to execute is kmpc_fork_teams
+  BranchInst *BrInst =
+      dyn_cast<BranchInst>(PushNumTeamsCallInst->getNextNonDebugInstruction());
+  ASSERT_NE(BrInst, nullptr);
+  ASSERT_EQ(BrInst->getNumSuccessors(), 1U);
+  Instruction *NextInstruction =
+      BrInst->getSuccessor(0)->getFirstNonPHIOrDbgOrLifetime();
+  CallInst *ForkTeamsCI = dyn_cast_if_present<CallInst>(NextInstruction);
+  ASSERT_NE(ForkTeamsCI, nullptr);
+  EXPECT_EQ(ForkTeamsCI->getCalledFunction(),
+            OMPBuilder.getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_teams));
+}
+
+TEST_F(OpenMPIRBuilderTest, CreateTeamsWithNumTeamsUpper) {
+  using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
+  OpenMPIRBuilder OMPBuilder(*M);
+  OMPBuilder.initialize();
+  F->setName("func");
+  IRBuilder<> &Builder = OMPBuilder.Builder;
+  Builder.SetInsertPoint(BB);
+
+  Function *FakeFunction =
+      Function::Create(FunctionType::get(Builder.getVoidTy(), false),
+                       GlobalValue::ExternalLinkage, "fakeFunction", M.get());
+
+  auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
+    Builder.restoreIP(CodeGenIP);
+    Builder.CreateCall(FakeFunction, {});
+  };
+
+  // `F` already has an integer argument, so we use that as upper bound to
+  // `num_teams`
+  Builder.restoreIP(OMPBuilder.createTeams(Builder, BodyGenCB,
+                                           /*NumTeamsLower=*/nullptr,
+                                           /*NumTeamsUpper=*/F->arg_begin()));
+
+  Builder.CreateRetVoid();
+  OMPBuilder.finalize();
+
+  ASSERT_FALSE(verifyModule(*M));
+
+  CallInst *PushNumTeamsCallInst =
+      findSingleCall(F, OMPRTL___kmpc_push_num_teams_51, OMPBuilder);
+  ASSERT_NE(PushNumTeamsCallInst, nullptr);
+
+  EXPECT_EQ(PushNumTeamsCallInst->getArgOperand(2), &*F->arg_begin());
+  EXPECT_EQ(PushNumTeamsCallInst->getArgOperand(3), &*F->arg_begin());
+  EXPECT_EQ(PushNumTeamsCallInst->getArgOperand(4), Builder.getInt32(0));
+
+  // Verifying that the next instruction to execute is kmpc_fork_teams
+  BranchInst *BrInst =
+      dyn_cast<BranchInst>(PushNumTeamsCallInst->getNextNonDebugInstruction());
+  ASSERT_NE(BrInst, nullptr);
+  ASSERT_EQ(BrInst->getNumSuccessors(), 1U);
+  Instruction *NextInstruction =
+      BrInst->getSuccessor(0)->getFirstNonPHIOrDbgOrLifetime();
+  CallInst *ForkTeamsCI = dyn_cast_if_present<CallInst>(NextInstruction);
+  ASSERT_NE(ForkTeamsCI, nullptr);
+  EXPECT_EQ(ForkTeamsCI->getCalledFunction(),
+            OMPBuilder.getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_teams));
+}
+
+TEST_F(OpenMPIRBuilderTest, CreateTeamsWithNumTeamsBoth) {
+  using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
+  OpenMPIRBuilder OMPBuilder(*M);
+  OMPBuilder.initialize();
+  F->setName("func");
+  IRBuilder<> &Builder = OMPBuilder.Builder;
+  Builder.SetInsertPoint(BB);
+
+  Function *FakeFunction =
+      Function::Create(FunctionType::get(Builder.getVoidTy(), false),
+                       GlobalValue::ExternalLinkage, "fakeFunction", M.get());
+
+  Value *NumTeamsLower =
+      Builder.CreateAdd(F->arg_begin(), Builder.getInt32(5), "numTeamsLower");
+  Value *NumTeamsUpper =
+      Builder.CreateAdd(F->arg_begin(), Builder.getInt32(10), "numTeamsUpper");
+
+  auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
+    Builder.restoreIP(CodeGenIP);
+    Builder.CreateCall(FakeFunction, {});
+  };
+
+  // `F` already has an integer argument, so we use that as upper bound to
+  // `num_teams`
+  Builder.restoreIP(
+      OMPBuilder.createTeams(Builder, BodyGenCB, NumTeamsLower, NumTeamsUpper));
+
+  Builder.CreateRetVoid();
+  OMPBuilder.finalize();
+
+  ASSERT_FALSE(verifyModule(*M));
+
+  CallInst *PushNumTeamsCallInst =
+      findSingleCall(F, OMPRTL___kmpc_push_num_teams_51, OMPBuilder);
+  ASSERT_NE(PushNumTeamsCallInst, nullptr);
+
+  EXPECT_EQ(PushNumTeamsCallInst->getArgOperand(2), NumTeamsLower);
+  EXPECT_EQ(PushNumTeamsCallInst->getArgOperand(3), NumTeamsUpper);
+  EXPECT_EQ(PushNumTeamsCallInst->getArgOperand(4), Builder.getInt32(0));
+
+  // Verifying that the next instruction to execute is kmpc_fork_teams
+  BranchInst *BrInst =
+      dyn_cast<BranchInst>(PushNumTeamsCallInst->getNextNonDebugInstruction());
+  ASSERT_NE(BrInst, nullptr);
+  ASSERT_EQ(BrInst->getNumSuccessors(), 1U);
+  Instruction *NextInstruction =
+      BrInst->getSuccessor(0)->getFirstNonPHIOrDbgOrLifetime();
+  CallInst *ForkTeamsCI = dyn_cast_if_present<CallInst>(NextInstruction);
+  ASSERT_NE(ForkTeamsCI, nullptr);
+  EXPECT_EQ(ForkTeamsCI->getCalledFunction(),
+            OMPBuilder.getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_teams));
+}
+
+TEST_F(OpenMPIRBuilderTest, CreateTeamsWithNumTeamsAndThreadLimit) {
+  using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
+  OpenMPIRBuilder OMPBuilder(*M);
+  OMPBuilder.initialize();
+  F->setName("func");
+  IRBuilder<> &Builder = OMPBuilder.Builder;
+  Builder.SetInsertPoint(BB);
+
+  BasicBlock *CodegenBB = splitBB(Builder, true);
+  Builder.SetInsertPoint(CodegenBB);
+
+  // Generate values for `num_teams` and `thread_limit` using the first argument
+  // of the testing function.
+  Value *NumTeamsLower =
+      Builder.CreateAdd(F->arg_begin(), Builder.getInt32(5), "numTeamsLower");
+  Value *NumTeamsUpper =
+      Builder.CreateAdd(F->arg_begin(), Builder.getInt32(10), "numTeamsUpper");
+  Value *ThreadLimit =
+      Builder.CreateAdd(F->arg_begin(), Builder.getInt32(20), "threadLimit");
+
+  Function *FakeFunction =
+      Function::Create(FunctionType::get(Builder.getVoidTy(), false),
+                       GlobalValue::ExternalLinkage, "fakeFunction", M.get());
+
+  auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
+    Builder.restoreIP(CodeGenIP);
+    Builder.CreateCall(FakeFunction, {});
+  };
+
+  OpenMPIRBuilder::LocationDescription Loc({Builder.saveIP(), DL});
+  Builder.restoreIP(OMPBuilder.createTeams(Builder, BodyGenCB, NumTeamsLower,
+                                           NumTeamsUpper, ThreadLimit));
+
+  Builder.CreateRetVoid();
+  OMPBuilder.finalize();
+
+  ASSERT_FALSE(verifyModule(*M));
+
+  CallInst *PushNumTeamsCallInst =
+      findSingleCall(F, OMPRTL___kmpc_push_num_teams_51, OMPBuilder);
+  ASSERT_NE(PushNumTeamsCallInst, nullptr);
+
+  EXPECT_EQ(PushNumTeamsCallInst->getArgOperand(2), NumTeamsLower);
+  EXPECT_EQ(PushNumTeamsCallInst->getArgOperand(3), NumTeamsUpper);
+  EXPECT_EQ(PushNumTeamsCallInst->getArgOperand(4), ThreadLimit);
+
+  // Verifying that the next instruction to execute is kmpc_fork_teams
+  BranchInst *BrInst =
+      dyn_cast<BranchInst>(PushNumTeamsCallInst->getNextNonDebugInstruction());
+  ASSERT_NE(BrInst, nullptr);
+  ASSERT_EQ(BrInst->getNumSuccessors(), 1U);
+  Instruction *NextInstruction =
+      BrInst->getSuccessor(0)->getFirstNonPHIOrDbgOrLifetime();
+  CallInst *ForkTeamsCI = dyn_cast_if_present<CallInst>(NextInstruction);
+  ASSERT_NE(ForkTeamsCI, nullptr);
+  EXPECT_EQ(ForkTeamsCI->getCalledFunction(),
+            OMPBuilder.getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_teams));
+}
+
 /// Returns the single instruction of InstTy type in BB that uses the value V.
 /// If there is more than one such instruction, returns null.
 template <typename InstTy>

>From 289034ca7feaea09752645443fdf6d4c0e6ffc29 Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles at arm.com>
Date: Wed, 11 Oct 2023 15:36:33 +0000
Subject: [PATCH 13/38] [flang] add missing dependency FIRTransforms ->
 FIRAnalysis

---
 flang/lib/Optimizer/Transforms/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/flang/lib/Optimizer/Transforms/CMakeLists.txt b/flang/lib/Optimizer/Transforms/CMakeLists.txt
index 74a093fe74719b6..98314fa7a2087fe 100644
--- a/flang/lib/Optimizer/Transforms/CMakeLists.txt
+++ b/flang/lib/Optimizer/Transforms/CMakeLists.txt
@@ -28,6 +28,7 @@ add_flang_library(FIRTransforms
   HLFIROpsIncGen
 
   LINK_LIBS
+  FIRAnalysis
   FIRBuilder
   FIRCodeGen
   FIRDialect

>From 5fd2ee40d9b14fed70ccdb2b5b3c38f215979664 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Wed, 11 Oct 2023 16:37:47 +0100
Subject: [PATCH 14/38] [SlotIndexes] Use upper/lower bound terminology for MBB
 searches. NFC. (#68802)

Rename advanceMBBIndex and findMBBIndex to getMBBLowerBound and add
getMBBUpperBound.

The motivations are:
- Make it clear what kind of search is being done, using names inspired
  by std::upper/lower_bound.
- Simplify getMBBFromIndex, which really wants an upper bound search and
  previously had to work hard to get the result it wanted from a lower
  bound search (see the short sketch after this list).
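
Illustration only: the std::lower_bound/std::upper_bound semantics the
new names are meant to evoke, shown on a stand-in vector of block start
positions rather than real SlotIndexes.

  #include <algorithm>
  #include <iterator>
  #include <vector>

  int main() {
    std::vector<int> starts = {0, 16, 48, 112}; // stand-in MBB start indexes
    auto lb = std::lower_bound(starts.begin(), starts.end(), 48);
    auto ub = std::upper_bound(starts.begin(), starts.end(), 48);
    // lower bound: first element >= 48 -> 48
    // upper bound: first element >  48 -> 112
    // The block containing 48 is the one just before the upper bound.
    return (*lb == 48 && *ub == 112 && *std::prev(ub) == 48) ? 0 : 1;
  }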
---
 llvm/include/llvm/CodeGen/SlotIndexes.h | 40 +++++++++++++------------
 llvm/lib/CodeGen/VirtRegMap.cpp         |  4 +--
 2 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/SlotIndexes.h b/llvm/include/llvm/CodeGen/SlotIndexes.h
index 1517d33e65a3801..828165c8b887b34 100644
--- a/llvm/include/llvm/CodeGen/SlotIndexes.h
+++ b/llvm/include/llvm/CodeGen/SlotIndexes.h
@@ -472,18 +472,25 @@ class raw_ostream;
     /// begin and basic block)
     using MBBIndexIterator = SmallVectorImpl<IdxMBBPair>::const_iterator;
 
-    /// Move iterator to the next IdxMBBPair where the SlotIndex is greater or
-    /// equal to \p To.
-    MBBIndexIterator advanceMBBIndex(MBBIndexIterator I, SlotIndex To) const {
-      return std::partition_point(
-          I, idx2MBBMap.end(),
-          [=](const IdxMBBPair &IM) { return IM.first < To; });
+    /// Get an iterator pointing to the first IdxMBBPair with SlotIndex greater
+    /// than or equal to \p Idx. If \p Start is provided, only search the range
+    /// from \p Start to the end of the function.
+    MBBIndexIterator getMBBLowerBound(MBBIndexIterator Start,
+                                      SlotIndex Idx) const {
+      return std::lower_bound(
+          Start, MBBIndexEnd(), Idx,
+          [](const IdxMBBPair &IM, SlotIndex Idx) { return IM.first < Idx; });
+    }
+    MBBIndexIterator getMBBLowerBound(SlotIndex Idx) const {
+      return getMBBLowerBound(MBBIndexBegin(), Idx);
     }
 
-    /// Get an iterator pointing to the IdxMBBPair with the biggest SlotIndex
-    /// that is greater or equal to \p Idx.
-    MBBIndexIterator findMBBIndex(SlotIndex Idx) const {
-      return advanceMBBIndex(idx2MBBMap.begin(), Idx);
+    /// Get an iterator pointing to the first IdxMBBPair with SlotIndex greater
+    /// than \p Idx.
+    MBBIndexIterator getMBBUpperBound(SlotIndex Idx) const {
+      return std::upper_bound(
+          MBBIndexBegin(), MBBIndexEnd(), Idx,
+          [](SlotIndex Idx, const IdxMBBPair &IM) { return Idx < IM.first; });
     }
 
     /// Returns an iterator for the begin of the idx2MBBMap.
@@ -501,16 +508,11 @@ class raw_ostream;
       if (MachineInstr *MI = getInstructionFromIndex(index))
         return MI->getParent();
 
-      MBBIndexIterator I = findMBBIndex(index);
-      // Take the pair containing the index
-      MBBIndexIterator J =
-        ((I != MBBIndexEnd() && I->first > index) ||
-         (I == MBBIndexEnd() && !idx2MBBMap.empty())) ? std::prev(I) : I;
-
-      assert(J != MBBIndexEnd() && J->first <= index &&
-             index < getMBBEndIdx(J->second) &&
+      MBBIndexIterator I = std::prev(getMBBUpperBound(index));
+      assert(I != MBBIndexEnd() && I->first <= index &&
+             index < getMBBEndIdx(I->second) &&
              "index does not correspond to an MBB");
-      return J->second;
+      return I->second;
     }
 
     /// Insert the given machine instruction into the mapping. Returns the
diff --git a/llvm/lib/CodeGen/VirtRegMap.cpp b/llvm/lib/CodeGen/VirtRegMap.cpp
index 5bdd86aebcd034e..48f4ee29fbe95d4 100644
--- a/llvm/lib/CodeGen/VirtRegMap.cpp
+++ b/llvm/lib/CodeGen/VirtRegMap.cpp
@@ -312,7 +312,7 @@ void VirtRegRewriter::addLiveInsForSubRanges(const LiveInterval &LI,
 
   // Check all mbb start positions between First and Last while
   // simultaneously advancing an iterator for each subrange.
-  for (SlotIndexes::MBBIndexIterator MBBI = Indexes->findMBBIndex(First);
+  for (SlotIndexes::MBBIndexIterator MBBI = Indexes->getMBBLowerBound(First);
        MBBI != Indexes->MBBIndexEnd() && MBBI->first <= Last; ++MBBI) {
     SlotIndex MBBBegin = MBBI->first;
     // Advance all subrange iterators so that their end position is just
@@ -363,7 +363,7 @@ void VirtRegRewriter::addMBBLiveIns() {
       // sorted by slot indexes.
       SlotIndexes::MBBIndexIterator I = Indexes->MBBIndexBegin();
       for (const auto &Seg : LI) {
-        I = Indexes->advanceMBBIndex(I, Seg.start);
+        I = Indexes->getMBBLowerBound(I, Seg.start);
         for (; I != Indexes->MBBIndexEnd() && I->first < Seg.end; ++I) {
           MachineBasicBlock *MBB = I->second;
           MBB->addLiveIn(PhysReg);

>From 7c749574cd657cf37b004010abc2e199b3ebf08d Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Wed, 11 Oct 2023 16:04:42 +0100
Subject: [PATCH 15/38] [SlotIndexes] Simplify getInstructionFromIndex. NFCI.

This method should never be called on an invalid index.
---
 llvm/include/llvm/CodeGen/SlotIndexes.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/include/llvm/CodeGen/SlotIndexes.h b/llvm/include/llvm/CodeGen/SlotIndexes.h
index 828165c8b887b34..72f4a6876b6cb14 100644
--- a/llvm/include/llvm/CodeGen/SlotIndexes.h
+++ b/llvm/include/llvm/CodeGen/SlotIndexes.h
@@ -387,7 +387,7 @@ class raw_ostream;
     /// Returns the instruction for the given index, or null if the given
     /// index has no instruction associated with it.
     MachineInstr* getInstructionFromIndex(SlotIndex index) const {
-      return index.isValid() ? index.listEntry()->getInstr() : nullptr;
+      return index.listEntry()->getInstr();
     }
 
     /// Returns the next non-null index, if one exists.

>From 38c02710b7e2c592ef6327e641435559e0c93d83 Mon Sep 17 00:00:00 2001
From: Matthias Springer <me at m-sp.org>
Date: Wed, 11 Oct 2023 08:51:16 -0700
Subject: [PATCH 16/38] [mlir][bufferization] Transfer `restrict` during empty
 tensor elimination (#68729)

Empty tensor elimination is looking for
`bufferization.materialize_in_destination` ops with a `tensor.empty`
source. It replaces the `tensor.empty` with a `bufferization.to_tensor
restrict` of the memref destination. As part of this rewrite, the `restrict`
keyword is removed from the `materialize_in_destination` op, so that no second
`to_tensor restrict` op will be inserted for the same buffer; such IR would be
invalid. `bufferization.materialize_in_destination` ops with a memref
destination but without the `restrict` attribute are ignored by empty tensor
elimination.

Also relax the verifier of `materialize_in_destination`. The `restrict`
keyword is not generally needed because the op does not expose the
buffer as a tensor.
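The core of the change is that the `restrict` claim is moved rather than
copied. A minimal, self-contained model of that transfer (plain C++ with
made-up types; the real logic operates on MLIR ops, see the
BufferizationOps.cpp hunk below):

  #include <cstdio>

  // Toy model (not the MLIR API): a buffer may carry at most one "restrict"
  // claim at a time. A rewrite that creates a new restrict view must transfer
  // the existing claim, not duplicate it.
  struct Op {
    int bufferId;
    bool restrictFlag;
  };

  // Mirrors the shape of the change below: bail out if the source op has no
  // claim to transfer, otherwise clear it there and set it on the new view.
  bool transferRestrict(Op &from, Op &to) {
    if (!from.restrictFlag)
      return false; // nothing to transfer; skip the rewrite
    from.restrictFlag = false;
    to.restrictFlag = true;
    return true;
  }

  int main() {
    Op materialize{/*bufferId=*/0, /*restrictFlag=*/true};
    Op newToTensor{/*bufferId=*/0, /*restrictFlag=*/false};
    bool rewritten = transferRestrict(materialize, newToTensor);
    std::printf("rewritten=%d old=%d new=%d\n", rewritten,
                materialize.restrictFlag, newToTensor.restrictFlag);
  }

Skipping ops without the flag corresponds to the new early return in
buildSubsetExtraction.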
---
 .../Bufferization/IR/BufferizationOps.td      | 40 ++++++++++---------
 .../Bufferization/IR/BufferizationOps.cpp     | 15 +++++--
 ...ot-bufferize-empty-tensor-elimination.mlir | 24 +++++++++++
 mlir/test/Dialect/Bufferization/invalid.mlir  |  9 +----
 4 files changed, 58 insertions(+), 30 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizationOps.td b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizationOps.td
index 34a6f5d74b13956..c779d1f843d76a0 100644
--- a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizationOps.td
+++ b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizationOps.td
@@ -246,28 +246,32 @@ def Bufferization_MaterializeInDestinationOp
     rewrite IR such that a computation is performed directly in `dest` and no
     memcpy is needed.
 
-    If `dest` is a buffer, the `restrict` and `writable` attributes must be
-    specified. These attributes have the same meaning as the respective
-    attributes of `bufferization.to_tensor`. `writable` indicates that the
-    `dest` buffer is considered writable. It does not make sense to materialize
-    a computation in a read-only buffer, so `writable` is required. `restrict`
-    indicates that this op is the only way for the tensor IR to access `dest`
-    (or an alias thereof). E.g., there must be no other `to_tensor` ops with
-    `dest` or with an alias of `dest`. Such IR is not supported by
-    One-Shot Bufferize. Ops that have incorrect usage of `restrict` may
-    bufferize incorrectly.
-
-    Note: `restrict` and `writable` could be removed from this op because they
-    must always be set for memref destinations. This op has these attributes to
-    make clear the requirements on the `dest` operand in the op assembly format.
-    Moreover, these requirements may be relaxed at some point in the future.
+    If `dest` is a buffer, the `writable` attribute must be specified and the
+    `restrict` keyword can be specified. These attributes have the same meaning
+    as the respective attributes of `bufferization.to_tensor`.
+
+    `writable` indicates that the `dest` buffer is considered writable. It does
+    not make sense to materialize a computation in a read-only buffer, so
+    `writable` is required.
+
+    `restrict` indicates that there is no `bufferization.to_tensor` op and no
+    other `bufferization.materialize_in_destination` op with `dest` (or an alias
+    thereof) and "restrict". Only ops with this attribute are considered for
+    "empty tensor elimination". As part of empty tensor elimination, a new
+    `to_tensor` op with `dest` may be inserted and the `restrict` attribute is
+    transferred from this op to the new `to_tensor` op. Having "restrict" on
+    this op guarantees that performing empty tensor elimination would not create
+    invalid IR (i.e., having multiple `to_tensor restrict` with aliasing
+    buffers).
+
+    Note: `writable` could be removed from this op because it must always be set
+    for memref destinations. This op has that attribute to make clear the
+    requirements on the `dest` operand in the op assembly format.
 
     Note: If `dest` is a tensor, `tensor.insert_slice` could be used for the
     same purpose, but since tensor dialect ops only indicate *what* should be
     computed but not *where*, it could fold away, causing the computation to
-    materialize in a different buffer. It is also possible that the
-    `tensor.insert_slice` destination bufferizes out-of-place, which would also
-    cause the computation to materialize in a buffer different buffer.
+    materialize in a different buffer.
   }];
 
   let arguments = (ins AnyTensor:$source, AnyShaped:$dest,
diff --git a/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp b/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp
index 738c8374d7add03..5716dcc9d905016 100644
--- a/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp
+++ b/mlir/lib/Dialect/Bufferization/IR/BufferizationOps.cpp
@@ -613,11 +613,19 @@ Value MaterializeInDestinationOp::buildSubsetExtraction(OpBuilder &builder,
     return getDest();
   }
 
+  // The "restrict" attribute is transferred from this op to the newly created
+  // to_tensor op. If this op does not have the "restrict" attribute, the subset
+  // extraction cannot be built because there is no guarantee that there is no
+  // pre-existing "restrict" to_tensor op with the same/an aliasing destination.
+  if (!getRestrict())
+    return {};
+
   // Build a bufferization.to_tensor op.
   assert(isa<BaseMemRefType>(getDest().getType()) && "expected memref type");
   assert(getRestrict() &&
          "expected that ops with memrefs dest have 'restrict'");
-  return builder.create<ToTensorOp>(loc, getDest(), getRestrict(),
+  setRestrict(false);
+  return builder.create<ToTensorOp>(loc, getDest(), /*restrict=*/true,
                                     getWritable());
 }
 
@@ -647,9 +655,8 @@ LogicalResult MaterializeInDestinationOp::verify() {
   if (isa<BaseMemRefType>(getDest().getType()) &&
       getOperation()->getNumResults() != 0)
     return emitOpError("memref 'dest' implies zero results");
-  if (getRestrict() != isa<BaseMemRefType>(getDest().getType()))
-    return emitOpError("'restrict' must be specified if and only if the "
-                       "destination is of memref type");
+  if (getRestrict() && !isa<BaseMemRefType>(getDest().getType()))
+    return emitOpError("'restrict' is valid only for memref destinations");
   if (getWritable() != isa<BaseMemRefType>(getDest().getType()))
     return emitOpError("'writable' must be specified if and only if the "
                        "destination is of memref type");
diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-empty-tensor-elimination.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-empty-tensor-elimination.mlir
index 99b974b9ef3c67e..9a3e14b6d391782 100644
--- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-empty-tensor-elimination.mlir
+++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-empty-tensor-elimination.mlir
@@ -1,4 +1,5 @@
 // RUN: mlir-opt %s -eliminate-empty-tensors -empty-tensor-to-alloc-tensor -one-shot-bufferize="bufferize-function-boundaries" -cse -canonicalize -split-input-file | FileCheck %s
+// RUN: mlir-opt %s -eliminate-empty-tensors | FileCheck %s --check-prefix=CHECK-ELIM
 
 //      CHECK: func @buffer_forwarding_conflict(
 // CHECK-SAME:   %[[FUNC_ARG:[0-9a-zA-Z]*]]: memref<?xf32>
@@ -341,3 +342,26 @@ func.func @linalg_copy_empty() -> tensor<26xi32> {
   %1 = linalg.copy ins(%0 : tensor<26xi32>) outs(%0 : tensor<26xi32>) -> tensor<26xi32>
   return %1 : tensor<26xi32>
 }
+
+// -----
+
+// CHECK-ELIM-LABEL: func @multiple_materialize_in_destination_buffer(
+//  CHECK-ELIM-SAME:     %[[m:.*]]: memref<5xf32>
+//       CHECK-ELIM:   tensor.empty
+//       CHECK-ELIM:   bufferization.to_tensor %[[m]] restrict writable
+//       CHECK-ELIM:   bufferization.materialize_in_destination {{.*}} in writable %[[m]]
+func.func @multiple_materialize_in_destination_buffer(%m: memref<5xf32>, %f: f32, %f2: f32, %c: i1) {
+  %0 = tensor.empty() : tensor<5xf32>
+  %filled = linalg.fill ins(%f : f32) outs(%0 : tensor<5xf32>) -> tensor<5xf32>
+
+  %1 = tensor.empty() : tensor<5xf32>
+  %filled2 = linalg.fill ins(%f2 : f32) outs(%1 : tensor<5xf32>) -> tensor<5xf32>
+
+  %selected = scf.if %c -> tensor<5xf32> {
+    scf.yield %filled : tensor<5xf32>
+  } else {
+    scf.yield %filled2 : tensor<5xf32>
+  }
+  bufferization.materialize_in_destination %selected in restrict writable %m : (tensor<5xf32>, memref<5xf32>) -> ()
+  return
+}
diff --git a/mlir/test/Dialect/Bufferization/invalid.mlir b/mlir/test/Dialect/Bufferization/invalid.mlir
index ce56f89c1f1bbe6..996d8430b84d48b 100644
--- a/mlir/test/Dialect/Bufferization/invalid.mlir
+++ b/mlir/test/Dialect/Bufferization/invalid.mlir
@@ -80,13 +80,6 @@ func.func @invalid_materialize_in_destination_dest_type(%arg0: tensor<5xf32>, %a
 
 // -----
 
-func.func @invalid_materialize_in_destination_restrict_missing(%arg0: tensor<?xf32>, %arg1: memref<?xf32>) {
-  // expected-error @below{{'restrict' must be specified if and only if the destination is of memref type}}
-  bufferization.materialize_in_destination %arg0 in %arg1 : (tensor<?xf32>, memref<?xf32>) -> ()
-}
-
-// -----
-
 func.func @invalid_materialize_in_destination_result(%arg0: tensor<?xf32>, %arg1: memref<?xf32>) {
   // expected-error @below{{memref 'dest' implies zero results}}
   bufferization.materialize_in_destination %arg0 in restrict %arg1 : (tensor<?xf32>, memref<?xf32>) -> (tensor<?xf32>)
@@ -102,7 +95,7 @@ func.func @invalid_materialize_in_destination_result_missing(%arg0: tensor<?xf32
 // -----
 
 func.func @invalid_materialize_in_destination_restrict(%arg0: tensor<?xf32>, %arg1: tensor<?xf32>) {
-  // expected-error @below{{'restrict' must be specified if and only if the destination is of memref type}}
+  // expected-error @below{{'restrict' is valid only for memref destinations}}
   bufferization.materialize_in_destination %arg0 in restrict %arg1 : (tensor<?xf32>, tensor<?xf32>) -> (tensor<?xf32>)
 }
 

>From 3eca61de46cde5fc9786c220aec0b9be7b89632c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?=
 =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?=
 =?UTF-8?q?=E3=83=B3=29?= <clementval at gmail.com>
Date: Wed, 11 Oct 2023 09:08:05 -0700
Subject: [PATCH 17/38] [flang][openacc] Relax type check for private recipe on
 acc.serial (#68814)

The check was already relaxed on `acc.parallel` but not on `acc.serial`.
This patch makes it consistent.
---
 flang/test/Lower/OpenACC/acc-private.f90 | 7 ++++++-
 mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp  | 4 ++--
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/flang/test/Lower/OpenACC/acc-private.f90 b/flang/test/Lower/OpenACC/acc-private.f90
index 6adbad754d0bffa..10c1bfc7c3802a3 100644
--- a/flang/test/Lower/OpenACC/acc-private.f90
+++ b/flang/test/Lower/OpenACC/acc-private.f90
@@ -270,6 +270,10 @@ subroutine acc_private_allocatable_array(a, n)
   do i = 1, n
     a(i) = i
   end do
+
+  !$acc serial private(a)
+  a(i) = 1
+  !$acc end serial
 end subroutine
 
 ! CHECK-LABEL: func.func @_QPacc_private_allocatable_array(
@@ -278,7 +282,8 @@ subroutine acc_private_allocatable_array(a, n)
 ! HLFIR: %[[BOX:.*]] = fir.load %[[DECLA_A]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
 ! HLFIR: %[[BOX_ADDR:.*]] = fir.box_addr %[[BOX]] : (!fir.box<!fir.heap<!fir.array<?xi32>>>) -> !fir.heap<!fir.array<?xi32>>
 ! HLFIR: %[[PRIVATE:.*]] = acc.private varPtr(%[[BOX_ADDR]] : !fir.heap<!fir.array<?xi32>>) bounds(%{{.*}}) -> !fir.heap<!fir.array<?xi32>> {name = "a"}
-! HLFIR: acc.parallel private(@privatization_box_heap_Uxi32 -> %[[PRIVATE]] : !fir.heap<!fir.array<?xi32>>) {
+! HLFIR: acc.parallel private(@privatization_box_heap_Uxi32 -> %[[PRIVATE]] : !fir.heap<!fir.array<?xi32>>)
+! HLFIR: acc.serial private(@privatization_box_heap_Uxi32 -> %{{.*}} : !fir.heap<!fir.array<?xi32>>)
 
 subroutine acc_private_pointer_array(a, n)
   integer, pointer :: a(:)
diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
index b2ac4c232757b11..cea93b8a2ca8ceb 100644
--- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
+++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
@@ -620,7 +620,7 @@ Value ParallelOp::getDataOperand(unsigned i) {
 LogicalResult acc::ParallelOp::verify() {
   if (failed(checkSymOperandList<mlir::acc::PrivateRecipeOp>(
           *this, getPrivatizations(), getGangPrivateOperands(), "private",
-          "privatizations", false)))
+          "privatizations", /*checkOperandType=*/false)))
     return failure();
   if (failed(checkSymOperandList<mlir::acc::ReductionRecipeOp>(
           *this, getReductionRecipes(), getReductionOperands(), "reduction",
@@ -650,7 +650,7 @@ Value SerialOp::getDataOperand(unsigned i) {
 LogicalResult acc::SerialOp::verify() {
   if (failed(checkSymOperandList<mlir::acc::PrivateRecipeOp>(
           *this, getPrivatizations(), getGangPrivateOperands(), "private",
-          "privatizations")))
+          "privatizations", /*checkOperandType=*/false)))
     return failure();
   if (failed(checkSymOperandList<mlir::acc::ReductionRecipeOp>(
           *this, getReductionRecipes(), getReductionOperands(), "reduction",

>From ebdec57274a9f572dbcadc713d63ea7c1178c9ea Mon Sep 17 00:00:00 2001
From: Jacek Caban <jacek at codeweavers.com>
Date: Wed, 11 Oct 2023 18:09:00 +0200
Subject: [PATCH 18/38] [lld] Don't allow -dynamicbase:no on ARM64EC.

---
 lld/COFF/Driver.cpp               | 2 +-
 lld/test/COFF/arm64-dynamicbase.s | 8 ++++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp
index c7f4e1f767a5431..278f5e71b14f543 100644
--- a/lld/COFF/Driver.cpp
+++ b/lld/COFF/Driver.cpp
@@ -2181,7 +2181,7 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) {
       args.hasFlag(OPT_highentropyva, OPT_highentropyva_no, true);
 
   if (!config->dynamicBase &&
-      (config->machine == ARMNT || config->machine == ARM64))
+      (config->machine == ARMNT || isAnyArm64(config->machine)))
     error("/dynamicbase:no is not compatible with " +
           machineToStr(config->machine));
 
diff --git a/lld/test/COFF/arm64-dynamicbase.s b/lld/test/COFF/arm64-dynamicbase.s
index d4cb12c1c8ffb9e..6ccce26a2f168a3 100644
--- a/lld/test/COFF/arm64-dynamicbase.s
+++ b/lld/test/COFF/arm64-dynamicbase.s
@@ -1,8 +1,16 @@
 // REQUIRES: aarch64
 // RUN: llvm-mc -filetype=obj -triple=aarch64-windows %s -o %t.obj
 // RUN: not lld-link -entry:_start -subsystem:console %t.obj -out:%t.exe -dynamicbase:no 2>&1 | FileCheck %s
+
+// RUN: llvm-mc -filetype=obj -triple=arm64ec-windows %s -o %t.obj
+// RUN: not lld-link -entry:_start -subsystem:console %t.obj -out:%t.exe -dynamicbase:no -machine:arm64ec 2>&1 \
+// RUN:              | FileCheck %s -check-prefix=ARM64EC
+// RUN: not lld-link -entry:_start -subsystem:console %t.obj -out:%t.exe -dynamicbase:no -machine:arm64x -dll -noentry 2>&1 \
+// RUN:              | FileCheck %s -check-prefix=ARM64X
  .globl _start
 _start:
  ret
 
 # CHECK: dynamicbase:no is not compatible with arm64
+# ARM64EC: dynamicbase:no is not compatible with arm64ec
+# ARM64X: dynamicbase:no is not compatible with arm64x

>From f206ee139e2286d351026013366d3fc9ff5619cc Mon Sep 17 00:00:00 2001
From: Aart Bik <39774503+aartbik at users.noreply.github.com>
Date: Wed, 11 Oct 2023 09:15:07 -0700
Subject: [PATCH 19/38] [mlir][sparse] refactor dim2lvl/lvl2dim passing into
 MapRef (#68649)

This revision refactors all "swiss army knife" entry points to pass
dim2lvl/lvl2dim mapping, so that the callee can construct a MapRef
(shown for SparseTensorStorage class). This is a next step towards
completely centralizing mapping code into a single MapRef class.
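For context on what is being plumbed through: dim2lvl and lvl2dim describe how
tensor dimension coordinates are rearranged into storage level coordinates and
back. A minimal, self-contained model of that translation (plain C++; the real
MapRef class and its method names may differ, and it may handle more than
simple permutations):

  #include <cstdint>
  #include <cstdio>
  #include <vector>

  // Simplified permutation-only model: dim2lvl[d] is the level that stores
  // dimension d, and lvl2dim[l] is the dimension stored at level l.
  struct SimpleMap {
    std::vector<uint64_t> dim2lvl, lvl2dim;

    // Dimension coordinates -> level coordinates.
    std::vector<uint64_t> toLvl(const std::vector<uint64_t> &dimCoords) const {
      std::vector<uint64_t> lvlCoords(lvl2dim.size());
      for (uint64_t l = 0; l < lvl2dim.size(); ++l)
        lvlCoords[l] = dimCoords[lvl2dim[l]];
      return lvlCoords;
    }
    // Level coordinates -> dimension coordinates.
    std::vector<uint64_t> toDim(const std::vector<uint64_t> &lvlCoords) const {
      std::vector<uint64_t> dimCoords(dim2lvl.size());
      for (uint64_t d = 0; d < dim2lvl.size(); ++d)
        dimCoords[d] = lvlCoords[dim2lvl[d]];
      return dimCoords;
    }
  };

  int main() {
    // A 2-d tensor whose storage level 0 holds dimension 1 (and vice versa).
    SimpleMap map{{1, 0}, {1, 0}};
    auto lvl = map.toLvl({3, 7}); // dim (3, 7) -> lvl (7, 3)
    auto dim = map.toDim(lvl);    // back to (3, 7)
    std::printf("(%llu,%llu) <-> (%llu,%llu)\n",
                (unsigned long long)dim[0], (unsigned long long)dim[1],
                (unsigned long long)lvl[0], (unsigned long long)lvl[1]);
  }

Passing both arrays to every constructor lets the callee build this object once
instead of re-deriving the mapping at each call site.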
---
 .../mlir/ExecutionEngine/SparseTensor/File.h  |   2 +-
 .../ExecutionEngine/SparseTensor/Storage.h    | 209 +++++++-----------
 .../ExecutionEngine/SparseTensor/Storage.cpp  |  11 +-
 .../ExecutionEngine/SparseTensorRuntime.cpp   |  21 +-
 4 files changed, 99 insertions(+), 144 deletions(-)

diff --git a/mlir/include/mlir/ExecutionEngine/SparseTensor/File.h b/mlir/include/mlir/ExecutionEngine/SparseTensor/File.h
index 9157bfa7e773239..efc3f82d6a307ea 100644
--- a/mlir/include/mlir/ExecutionEngine/SparseTensor/File.h
+++ b/mlir/include/mlir/ExecutionEngine/SparseTensor/File.h
@@ -203,7 +203,7 @@ class SparseTensorReader final {
     MapRef map(dimRank, lvlRank, dim2lvl, lvl2dim);
     auto *coo = readCOO<V>(map, lvlSizes);
     auto *tensor = SparseTensorStorage<P, I, V>::newFromCOO(
-        dimRank, getDimSizes(), lvlRank, lvlTypes, lvl2dim, *coo);
+        dimRank, getDimSizes(), lvlRank, lvlTypes, dim2lvl, lvl2dim, *coo);
     delete coo;
     return tensor;
   }
diff --git a/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h b/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h
index 0407bccaae8790c..303a41bc471d5d9 100644
--- a/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h
+++ b/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h
@@ -24,13 +24,8 @@
 #include "mlir/ExecutionEngine/SparseTensor/ArithmeticUtils.h"
 #include "mlir/ExecutionEngine/SparseTensor/COO.h"
 #include "mlir/ExecutionEngine/SparseTensor/ErrorHandling.h"
+#include "mlir/ExecutionEngine/SparseTensor/MapRef.h"
 
-#define ASSERT_VALID_DIM(d)                                                    \
-  assert(d < getDimRank() && "Dimension is out of bounds");
-#define ASSERT_VALID_LVL(l)                                                    \
-  assert(l < getLvlRank() && "Level is out of bounds");
-#define ASSERT_COMPRESSED_LVL(l)                                               \
-  assert(isCompressedLvl(l) && "Level is not compressed");
 #define ASSERT_COMPRESSED_OR_SINGLETON_LVL(l)                                  \
   do {                                                                         \
     const DimLevelType dlt = getLvlType(l);                                    \
@@ -49,9 +44,9 @@ class SparseTensorEnumeratorBase;
 template <typename P, typename C, typename V>
 class SparseTensorEnumerator;
 
-/// Abstract base class for `SparseTensorStorage<P,C,V>`.  This class
+/// Abstract base class for `SparseTensorStorage<P,C,V>`. This class
 /// takes responsibility for all the `<P,C,V>`-independent aspects
-/// of the tensor (e.g., shape, sparsity, permutation).  In addition,
+/// of the tensor (e.g., shape, sparsity, mapping). In addition,
 /// we use function overloading to implement "partial" method
 /// specialization, which the C-API relies on to catch type errors
 /// arising from our use of opaque pointers.
@@ -62,24 +57,20 @@ class SparseTensorEnumerator;
 /// coordinate spaces (and their associated rank, shape, sizes, etc).
 /// Denotationally, we have the *dimensions* of the tensor represented
 /// by this object.  Operationally, we have the *levels* of the storage
-/// representation itself.  We use this "dimension" vs "level" terminology
-/// throughout, since alternative terminology like "tensor-dimension",
-/// "original-dimension", "storage-dimension", etc, is both more verbose
-/// and prone to introduce confusion whenever the qualifiers are dropped.
-/// Where necessary, we use "axis" as the generic term.
+/// representation itself.
 ///
 /// The *size* of an axis is the cardinality of possible coordinate
 /// values along that axis (regardless of which coordinates have stored
-/// element values).  As such, each size must be non-zero since if any
+/// element values). As such, each size must be non-zero since if any
 /// axis has size-zero then the whole tensor would have trivial storage
-/// (since there are no possible coordinates).  Thus we use the plural
+/// (since there are no possible coordinates). Thus we use the plural
 /// term *sizes* for a collection of non-zero cardinalities, and use
-/// this term whenever referring to run-time cardinalities.  Whereas we
+/// this term whenever referring to run-time cardinalities. Whereas we
 /// use the term *shape* for a collection of compile-time cardinalities,
 /// where zero is used to indicate cardinalities which are dynamic (i.e.,
-/// unknown/unspecified at compile-time).  At run-time, these dynamic
+/// unknown/unspecified at compile-time). At run-time, these dynamic
 /// cardinalities will be inferred from or checked against sizes otherwise
-/// specified.  Thus, dynamic cardinalities always have an "immutable but
+/// specified. Thus, dynamic cardinalities always have an "immutable but
 /// unknown" value; so the term "dynamic" should not be taken to indicate
 /// run-time mutability.
 class SparseTensorStorageBase {
@@ -89,25 +80,10 @@ class SparseTensorStorageBase {
 
 public:
   /// Constructs a new sparse-tensor storage object with the given encoding.
-  ///
-  /// Preconditions:
-  /// * `dimSizes`, `lvlSizes`, `lvlTypes`, and `lvl2dim` must be nonnull.
-  /// * `dimSizes` must be valid for `dimRank`.
-  /// * `lvlSizes`, `lvlTypes`, and `lvl2dim` must be valid for `lvlRank`.
-  /// * `lvl2dim` must map `lvlSizes`-coordinates to `dimSizes`-coordinates.
-  ///
-  /// Asserts:
-  /// * `dimRank` and `lvlRank` are nonzero.
-  /// * `dimSizes` and `lvlSizes` contain only nonzero sizes.
   SparseTensorStorageBase(uint64_t dimRank, const uint64_t *dimSizes,
                           uint64_t lvlRank, const uint64_t *lvlSizes,
-                          const DimLevelType *lvlTypes,
+                          const DimLevelType *lvlTypes, const uint64_t *dim2lvl,
                           const uint64_t *lvl2dim);
-  // NOTE: For the most part we only need the `dimRank`.  But we need
-  // `dimSizes` for `toCOO` to support the identity permutation nicely
-  // (i.e., without the caller needing to already know the tensor's
-  // dimension-sizes; e.g., as in `fromMLIRSparseTensor`).
-
   virtual ~SparseTensorStorageBase() = default;
 
   /// Gets the number of tensor-dimensions.
@@ -121,7 +97,7 @@ class SparseTensorStorageBase {
 
   /// Safely looks up the size of the given tensor-dimension.
   uint64_t getDimSize(uint64_t d) const {
-    ASSERT_VALID_DIM(d);
+    assert(d < getDimRank() && "Dimension is out of bounds");
     return dimSizes[d];
   }
 
@@ -130,19 +106,16 @@ class SparseTensorStorageBase {
 
   /// Safely looks up the size of the given storage-level.
   uint64_t getLvlSize(uint64_t l) const {
-    ASSERT_VALID_LVL(l);
+    assert(l < getLvlRank() && "Level is out of bounds");
     return lvlSizes[l];
   }
 
-  /// Gets the level-to-dimension mapping.
-  const std::vector<uint64_t> &getLvl2Dim() const { return lvl2dim; }
-
   /// Gets the level-types array.
   const std::vector<DimLevelType> &getLvlTypes() const { return lvlTypes; }
 
   /// Safely looks up the type of the given level.
   DimLevelType getLvlType(uint64_t l) const {
-    ASSERT_VALID_LVL(l);
+    assert(l < getLvlRank() && "Level is out of bounds");
     return lvlTypes[l];
   }
 
@@ -165,6 +138,10 @@ class SparseTensorStorageBase {
   /// Safely checks if the level is unique.
   bool isUniqueLvl(uint64_t l) const { return isUniqueDLT(getLvlType(l)); }
 
+  /// Gets the level-to-dimension mapping.
+  // TODO: REMOVE THIS
+  const std::vector<uint64_t> &getLvl2Dim() const { return lvl2dimVec; }
+
   /// Allocates a new enumerator.  Callers must make sure to delete
   /// the enumerator when they're done with it. The first argument
   /// is the out-parameter for storing the newly allocated enumerator;
@@ -228,12 +205,14 @@ class SparseTensorStorageBase {
   const std::vector<uint64_t> dimSizes;
   const std::vector<uint64_t> lvlSizes;
   const std::vector<DimLevelType> lvlTypes;
-  const std::vector<uint64_t> lvl2dim;
+  const std::vector<uint64_t> dim2lvlVec;
+  const std::vector<uint64_t> lvl2dimVec;
+  const MapRef map; // non-owning pointers into dim2lvl/lvl2dim vectors
 };
 
 /// A memory-resident sparse tensor using a storage scheme based on
-/// per-level sparse/dense annotations.  This data structure provides
-/// a bufferized form of a sparse tensor type.  In contrast to generating
+/// per-level sparse/dense annotations. This data structure provides
+/// a bufferized form of a sparse tensor type. In contrast to generating
 /// setup methods for each differently annotated sparse tensor, this
 /// method provides a convenient "one-size-fits-all" solution that simply
 /// takes an input tensor and annotations to implement all required setup
@@ -244,58 +223,45 @@ class SparseTensorStorage final : public SparseTensorStorageBase {
   /// Beware that the object is not necessarily guaranteed to be in a
   /// valid state after this constructor alone; e.g., `isCompressedLvl(l)`
   /// doesn't entail `!(positions[l].empty())`.
-  ///
-  /// Preconditions/assertions are as per the `SparseTensorStorageBase` ctor.
   SparseTensorStorage(uint64_t dimRank, const uint64_t *dimSizes,
                       uint64_t lvlRank, const uint64_t *lvlSizes,
-                      const DimLevelType *lvlTypes, const uint64_t *lvl2dim)
+                      const DimLevelType *lvlTypes, const uint64_t *dim2lvl,
+                      const uint64_t *lvl2dim)
       : SparseTensorStorageBase(dimRank, dimSizes, lvlRank, lvlSizes, lvlTypes,
-                                lvl2dim),
+                                dim2lvl, lvl2dim),
         positions(lvlRank), coordinates(lvlRank), lvlCursor(lvlRank) {}
 
 public:
   /// Constructs a sparse tensor with the given encoding, and allocates
-  /// overhead storage according to some simple heuristics.  When the
+  /// overhead storage according to some simple heuristics. When the
   /// `bool` argument is true and `lvlTypes` are all dense, then this
-  /// ctor will also initialize the values array with zeros.  That
+  /// ctor will also initialize the values array with zeros. That
   /// argument should be true when an empty tensor is intended; whereas
   /// it should usually be false when the ctor will be followed up by
   /// some other form of initialization.
-  ///
-  /// Preconditions/assertions are as per the `SparseTensorStorageBase` ctor.
   SparseTensorStorage(uint64_t dimRank, const uint64_t *dimSizes,
                       uint64_t lvlRank, const uint64_t *lvlSizes,
-                      const DimLevelType *lvlTypes, const uint64_t *lvl2dim,
-                      bool initializeValuesIfAllDense);
+                      const DimLevelType *lvlTypes, const uint64_t *dim2lvl,
+                      const uint64_t *lvl2dim, bool initializeValuesIfAllDense);
 
   /// Constructs a sparse tensor with the given encoding, and initializes
-  /// the contents from the COO.  This ctor performs the same heuristic
-  /// overhead-storage allocation as the ctor taking a `bool`, and
-  /// has the same preconditions/assertions (where we define `lvlSizes =
-  /// lvlCOO.getDimSizes().data()`), with the following addition:
-  ///
-  /// Asserts:
-  /// * `lvlRank == lvlCOO.getRank()`.
+  /// the contents from the COO. This ctor performs the same heuristic
+  /// overhead-storage allocation as the ctor taking a `bool`.
   SparseTensorStorage(uint64_t dimRank, const uint64_t *dimSizes,
                       uint64_t lvlRank, const DimLevelType *lvlTypes,
-                      const uint64_t *lvl2dim, SparseTensorCOO<V> &lvlCOO);
+                      const uint64_t *dim2lvl, const uint64_t *lvl2dim,
+                      SparseTensorCOO<V> &lvlCOO);
 
   /// Constructs a sparse tensor with the given encoding, and initializes
-  /// the contents from the enumerator.  This ctor allocates exactly
+  /// the contents from the enumerator. This ctor allocates exactly
   /// the required amount of overhead storage, not using any heuristics.
-  /// Preconditions/assertions are as per the `SparseTensorStorageBase`
-  /// ctor (where we define `lvlSizes = lvlEnumerator.getTrgSizes().data()`),
-  /// with the following addition:
-  ///
-  /// Asserts:
-  /// * `lvlRank == lvlEnumerator.getTrgRank()`.
   SparseTensorStorage(uint64_t dimRank, const uint64_t *dimSizes,
                       uint64_t lvlRank, const DimLevelType *lvlTypes,
-                      const uint64_t *lvl2dim,
+                      const uint64_t *dim2lvl, const uint64_t *lvl2dim,
                       SparseTensorEnumeratorBase<V> &lvlEnumerator);
 
   /// Constructs a sparse tensor with the given encoding, and initializes
-  /// the contents from the level buffers.  This ctor allocates exactly
+  /// the contents from the level buffers. This ctor allocates exactly
   /// the required amount of overhead storage, not using any heuristics.
   /// It assumes that the data provided by `lvlBufs` can be directly used to
   /// interpret the result sparse tensor and performs *NO* integrity test on the
@@ -303,8 +269,8 @@ class SparseTensorStorage final : public SparseTensorStorageBase {
   /// passed in as a single AoS memory.
   SparseTensorStorage(uint64_t dimRank, const uint64_t *dimSizes,
                       uint64_t lvlRank, const uint64_t *lvlSizes,
-                      const DimLevelType *lvlTypes, const uint64_t *lvl2dim,
-                      const intptr_t *lvlBufs);
+                      const DimLevelType *lvlTypes, const uint64_t *dim2lvl,
+                      const uint64_t *lvl2dim, const intptr_t *lvlBufs);
 
   /// Allocates a new empty sparse tensor. The preconditions/assertions
   /// are as per the `SparseTensorStorageBase` ctor; which is to say,
@@ -313,21 +279,15 @@ class SparseTensorStorage final : public SparseTensorStorageBase {
   static SparseTensorStorage<P, C, V> *
   newEmpty(uint64_t dimRank, const uint64_t *dimSizes, uint64_t lvlRank,
            const uint64_t *lvlSizes, const DimLevelType *lvlTypes,
-           const uint64_t *lvl2dim) {
-    return new SparseTensorStorage<P, C, V>(dimRank, dimSizes, lvlRank,
-                                            lvlSizes, lvlTypes, lvl2dim, true);
+           const uint64_t *dim2lvl, const uint64_t *lvl2dim) {
+    return new SparseTensorStorage<P, C, V>(
+        dimRank, dimSizes, lvlRank, lvlSizes, lvlTypes, dim2lvl, lvl2dim, true);
   }
 
   /// Allocates a new sparse tensor and initializes it from the given COO.
   /// The preconditions are as per the `SparseTensorStorageBase` ctor
   /// (where we define `lvlSizes = lvlCOO.getDimSizes().data()`), but
   /// using the following assertions in lieu of the base ctor's assertions:
-  ///
-  /// Asserts:
-  /// * `dimRank` and `lvlRank` are nonzero.
-  /// * `lvlRank == lvlCOO.getRank()`.
-  /// * `lvlCOO.getDimSizes()` under the `lvl2dim` mapping is a refinement
-  ///   of `dimShape`.
   //
   // TODO: The ability to reconstruct dynamic dimensions-sizes does not
   // easily generalize to arbitrary `lvl2dim` mappings.  When compiling
@@ -338,8 +298,8 @@ class SparseTensorStorage final : public SparseTensorStorageBase {
   // to update the type/preconditions of this factory too.
   static SparseTensorStorage<P, C, V> *
   newFromCOO(uint64_t dimRank, const uint64_t *dimShape, uint64_t lvlRank,
-             const DimLevelType *lvlTypes, const uint64_t *lvl2dim,
-             SparseTensorCOO<V> &lvlCOO);
+             const DimLevelType *lvlTypes, const uint64_t *dim2lvl,
+             const uint64_t *lvl2dim, SparseTensorCOO<V> &lvlCOO);
 
   /// Allocates a new sparse tensor and initializes it with the contents
   /// of another sparse tensor.
@@ -370,8 +330,9 @@ class SparseTensorStorage final : public SparseTensorStorageBase {
   static SparseTensorStorage<P, C, V> *
   newFromSparseTensor(uint64_t dimRank, const uint64_t *dimShape,
                       uint64_t lvlRank, const uint64_t *lvlSizes,
-                      const DimLevelType *lvlTypes, const uint64_t *lvl2dim,
-                      uint64_t srcRank, const uint64_t *src2lvl,
+                      const DimLevelType *lvlTypes,
+                      const uint64_t *src2lvl, // FIXME: dim2lvl,
+                      const uint64_t *lvl2dim, uint64_t srcRank,
                       const SparseTensorStorageBase &source);
 
   /// Allocates a new sparse tensor and initialize it with the data stored level
@@ -380,24 +341,23 @@ class SparseTensorStorage final : public SparseTensorStorageBase {
   /// Precondition:
   /// * as per the `SparseTensorStorageBase` ctor.
   /// * the data integrity stored in `buffers` is guaranteed by users already.
-  static SparseTensorStorage<P, C, V> *
-  packFromLvlBuffers(uint64_t dimRank, const uint64_t *dimShape,
-                     uint64_t lvlRank, const uint64_t *lvlSizes,
-                     const DimLevelType *lvlTypes, const uint64_t *lvl2dim,
-                     uint64_t srcRank, const uint64_t *src2lvl,
-                     const intptr_t *buffers);
+  static SparseTensorStorage<P, C, V> *packFromLvlBuffers(
+      uint64_t dimRank, const uint64_t *dimShape, uint64_t lvlRank,
+      const uint64_t *lvlSizes, const DimLevelType *lvlTypes,
+      const uint64_t *src2lvl, // FIXME: dim2lvl
+      const uint64_t *lvl2dim, uint64_t srcRank, const intptr_t *buffers);
 
   ~SparseTensorStorage() final = default;
 
   /// Partially specialize these getter methods based on template types.
   void getPositions(std::vector<P> **out, uint64_t lvl) final {
     assert(out && "Received nullptr for out parameter");
-    ASSERT_VALID_LVL(lvl);
+    assert(lvl < getLvlRank() && "Level is out of bounds");
     *out = &positions[lvl];
   }
   void getCoordinates(std::vector<C> **out, uint64_t lvl) final {
     assert(out && "Received nullptr for out parameter");
-    ASSERT_VALID_LVL(lvl);
+    assert(lvl < getLvlRank() && "Level is out of bounds");
     *out = &coordinates[lvl];
   }
   void getValues(std::vector<V> **out) final {
@@ -477,12 +437,12 @@ class SparseTensorStorage final : public SparseTensorStorageBase {
 
   /// Allocates a new COO object and initializes it with the contents
   /// of this tensor under the given mapping from the `getDimSizes()`
-  /// coordinate-space to the `trgSizes` coordinate-space.  Callers must
+  /// coordinate-space to the `trgSizes` coordinate-space. Callers must
   /// make sure to delete the COO when they're done with it.
-  ///
-  /// Preconditions/assertions are as per the `SparseTensorEnumerator` ctor.
   SparseTensorCOO<V> *toCOO(uint64_t trgRank, const uint64_t *trgSizes,
-                            uint64_t srcRank, const uint64_t *src2trg) const {
+                            uint64_t srcRank,
+                            const uint64_t *src2trg, // FIXME: dim2lvl
+                            const uint64_t *lvl2dim) const {
     // We inline `newEnumerator` to avoid virtual dispatch and allocation.
     // TODO: use MapRef here too for the translation
     SparseTensorEnumerator<P, C, V> enumerator(*this, trgRank, trgSizes,
@@ -503,7 +463,7 @@ class SparseTensorStorage final : public SparseTensorStorageBase {
   /// does not check that `pos` is semantically valid (i.e., larger than
   /// the previous position and smaller than `coordinates[lvl].capacity()`).
   void appendPos(uint64_t lvl, uint64_t pos, uint64_t count = 1) {
-    ASSERT_COMPRESSED_LVL(lvl);
+    assert(isCompressedLvl(lvl) && "Level is not compressed");
     positions[lvl].insert(positions[lvl].end(), count,
                           detail::checkOverflowCast<P>(pos));
   }
@@ -689,9 +649,6 @@ class SparseTensorStorage final : public SparseTensorStorageBase {
 };
 
 #undef ASSERT_COMPRESSED_OR_SINGLETON_LVL
-#undef ASSERT_COMPRESSED_LVL
-#undef ASSERT_VALID_LVL
-#undef ASSERT_VALID_DIM
 
 //===----------------------------------------------------------------------===//
 /// A (higher-order) function object for enumerating the elements of some
@@ -934,11 +891,12 @@ class SparseTensorNNZ final {
 //===----------------------------------------------------------------------===//
 // Definitions of the ctors and factories of `SparseTensorStorage<P,C,V>`.
 
+// TODO: MapRef
 template <typename P, typename C, typename V>
 SparseTensorStorage<P, C, V> *SparseTensorStorage<P, C, V>::newFromCOO(
     uint64_t dimRank, const uint64_t *dimShape, uint64_t lvlRank,
-    const DimLevelType *lvlTypes, const uint64_t *lvl2dim,
-    SparseTensorCOO<V> &lvlCOO) {
+    const DimLevelType *lvlTypes, const uint64_t *dim2lvl,
+    const uint64_t *lvl2dim, SparseTensorCOO<V> &lvlCOO) {
   assert(dimShape && "Got nullptr for dimension shape");
   assert(lvl2dim && "Got nullptr for level-to-dimension mapping");
   const auto &lvlSizes = lvlCOO.getDimSizes();
@@ -955,14 +913,15 @@ SparseTensorStorage<P, C, V> *SparseTensorStorage<P, C, V>::newFromCOO(
     dimSizes[d] = lvlSizes[l];
   }
   return new SparseTensorStorage<P, C, V>(dimRank, dimSizes.data(), lvlRank,
-                                          lvlTypes, lvl2dim, lvlCOO);
+                                          lvlTypes, dim2lvl, lvl2dim, lvlCOO);
 }
 
 template <typename P, typename C, typename V>
 SparseTensorStorage<P, C, V> *SparseTensorStorage<P, C, V>::newFromSparseTensor(
     uint64_t dimRank, const uint64_t *dimShape, uint64_t lvlRank,
     const uint64_t *lvlSizes, const DimLevelType *lvlTypes,
-    const uint64_t *lvl2dim, uint64_t srcRank, const uint64_t *src2lvl,
+    const uint64_t *src2lvl, // dim2lvl
+    const uint64_t *lvl2dim, uint64_t srcRank,
     const SparseTensorStorageBase &source) {
   // Verify that the `source` dimensions match the expected `dimShape`.
   assert(dimShape && "Got nullptr for dimension shape");
@@ -977,8 +936,9 @@ SparseTensorStorage<P, C, V> *SparseTensorStorage<P, C, V>::newFromSparseTensor(
 #endif
   SparseTensorEnumeratorBase<V> *lvlEnumerator;
   source.newEnumerator(&lvlEnumerator, lvlRank, lvlSizes, srcRank, src2lvl);
-  auto *tensor = new SparseTensorStorage<P, C, V>(
-      dimRank, dimSizes.data(), lvlRank, lvlTypes, lvl2dim, *lvlEnumerator);
+  auto *tensor = new SparseTensorStorage<P, C, V>(dimRank, dimSizes.data(),
+                                                  lvlRank, lvlTypes, src2lvl,
+                                                  lvl2dim, *lvlEnumerator);
   delete lvlEnumerator;
   return tensor;
 }
@@ -987,11 +947,12 @@ template <typename P, typename C, typename V>
 SparseTensorStorage<P, C, V> *SparseTensorStorage<P, C, V>::packFromLvlBuffers(
     uint64_t dimRank, const uint64_t *dimShape, uint64_t lvlRank,
     const uint64_t *lvlSizes, const DimLevelType *lvlTypes,
-    const uint64_t *lvl2dim, uint64_t srcRank, const uint64_t *src2lvl,
-    const intptr_t *buffers) {
+    const uint64_t *src2lvl, // FIXME: dim2lvl
+    const uint64_t *lvl2dim, uint64_t srcRank, const intptr_t *buffers) {
   assert(dimShape && "Got nullptr for dimension shape");
-  auto *tensor = new SparseTensorStorage<P, C, V>(
-      dimRank, dimShape, lvlRank, lvlSizes, lvlTypes, lvl2dim, buffers);
+  auto *tensor =
+      new SparseTensorStorage<P, C, V>(dimRank, dimShape, lvlRank, lvlSizes,
+                                       lvlTypes, src2lvl, lvl2dim, buffers);
   return tensor;
 }
 
@@ -999,9 +960,10 @@ template <typename P, typename C, typename V>
 SparseTensorStorage<P, C, V>::SparseTensorStorage(
     uint64_t dimRank, const uint64_t *dimSizes, uint64_t lvlRank,
     const uint64_t *lvlSizes, const DimLevelType *lvlTypes,
-    const uint64_t *lvl2dim, bool initializeValuesIfAllDense)
+    const uint64_t *dim2lvl, const uint64_t *lvl2dim,
+    bool initializeValuesIfAllDense)
     : SparseTensorStorage(dimRank, dimSizes, lvlRank, lvlSizes, lvlTypes,
-                          lvl2dim) {
+                          dim2lvl, lvl2dim) {
   // Provide hints on capacity of positions and coordinates.
   // TODO: needs much fine-tuning based on actual sparsity; currently
   // we reserve position/coordinate space based on all previous dense
@@ -1012,8 +974,6 @@ SparseTensorStorage<P, C, V>::SparseTensorStorage(
   for (uint64_t l = 0; l < lvlRank; ++l) {
     const DimLevelType dlt = lvlTypes[l]; // Avoid redundant bounds checking.
     if (isCompressedDLT(dlt)) {
-      // TODO: Take a parameter between 1 and `lvlSizes[l]`, and multiply
-      // `sz` by that before reserving. (For now we just use 1.)
       positions[l].reserve(sz + 1);
       positions[l].push_back(0);
       coordinates[l].reserve(sz);
@@ -1035,11 +995,11 @@ SparseTensorStorage<P, C, V>::SparseTensorStorage(
 template <typename P, typename C, typename V>
 SparseTensorStorage<P, C, V>::SparseTensorStorage( // NOLINT
     uint64_t dimRank, const uint64_t *dimSizes, uint64_t lvlRank,
-    const DimLevelType *lvlTypes, const uint64_t *lvl2dim,
-    SparseTensorCOO<V> &lvlCOO)
+    const DimLevelType *lvlTypes, const uint64_t *dim2lvl,
+    const uint64_t *lvl2dim, SparseTensorCOO<V> &lvlCOO)
     : SparseTensorStorage(dimRank, dimSizes, lvlRank,
-                          lvlCOO.getDimSizes().data(), lvlTypes, lvl2dim,
-                          false) {
+                          lvlCOO.getDimSizes().data(), lvlTypes, dim2lvl,
+                          lvl2dim, false) {
   assert(lvlRank == lvlCOO.getDimSizes().size() && "Level-rank mismatch");
   // Ensure the preconditions of `fromCOO`.  (One is already ensured by
   // using `lvlSizes = lvlCOO.getDimSizes()` in the ctor above.)
@@ -1054,10 +1014,10 @@ SparseTensorStorage<P, C, V>::SparseTensorStorage( // NOLINT
 template <typename P, typename C, typename V>
 SparseTensorStorage<P, C, V>::SparseTensorStorage(
     uint64_t dimRank, const uint64_t *dimSizes, uint64_t lvlRank,
-    const DimLevelType *lvlTypes, const uint64_t *lvl2dim,
-    SparseTensorEnumeratorBase<V> &lvlEnumerator)
+    const DimLevelType *lvlTypes, const uint64_t *dim2lvl,
+    const uint64_t *lvl2dim, SparseTensorEnumeratorBase<V> &lvlEnumerator)
     : SparseTensorStorage(dimRank, dimSizes, lvlRank,
-                          lvlEnumerator.getTrgSizes().data(), lvlTypes,
+                          lvlEnumerator.getTrgSizes().data(), lvlTypes, dim2lvl,
                           lvl2dim) {
   assert(lvlRank == lvlEnumerator.getTrgRank() && "Level-rank mismatch");
   {
@@ -1137,7 +1097,6 @@ SparseTensorStorage<P, C, V>::SparseTensorStorage(
       // Can't check all of them, but at least we can check the last one.
       assert(positions[l][parentSz - 1] == positions[l][parentSz] &&
              "Positions got corrupted");
-      // TODO: optimize this by using `memmove` or similar.
       for (uint64_t n = 0; n < parentSz; ++n) {
         const uint64_t parentPos = parentSz - n;
         positions[l][parentPos] = positions[l][parentPos - 1];
@@ -1157,9 +1116,9 @@ template <typename P, typename C, typename V>
 SparseTensorStorage<P, C, V>::SparseTensorStorage(
     uint64_t dimRank, const uint64_t *dimSizes, uint64_t lvlRank,
     const uint64_t *lvlSizes, const DimLevelType *lvlTypes,
-    const uint64_t *lvl2dim, const intptr_t *lvlBufs)
+    const uint64_t *dim2lvl, const uint64_t *lvl2dim, const intptr_t *lvlBufs)
     : SparseTensorStorage(dimRank, dimSizes, lvlRank, lvlSizes, lvlTypes,
-                          lvl2dim) {
+                          dim2lvl, lvl2dim) {
   uint64_t trailCOOLen = 0, parentSz = 1, bufIdx = 0;
   for (uint64_t l = 0; l < lvlRank; l++) {
     if (!isUniqueLvl(l) && isCompressedLvl(l)) {
diff --git a/mlir/lib/ExecutionEngine/SparseTensor/Storage.cpp b/mlir/lib/ExecutionEngine/SparseTensor/Storage.cpp
index 199e4205a61a25b..1d654cae3b4b125 100644
--- a/mlir/lib/ExecutionEngine/SparseTensor/Storage.cpp
+++ b/mlir/lib/ExecutionEngine/SparseTensor/Storage.cpp
@@ -20,15 +20,14 @@ using namespace mlir::sparse_tensor;
 SparseTensorStorageBase::SparseTensorStorageBase( // NOLINT
     uint64_t dimRank, const uint64_t *dimSizes, uint64_t lvlRank,
     const uint64_t *lvlSizes, const DimLevelType *lvlTypes,
-    const uint64_t *lvl2dim)
+    const uint64_t *dim2lvl, const uint64_t *lvl2dim)
     : dimSizes(dimSizes, dimSizes + dimRank),
       lvlSizes(lvlSizes, lvlSizes + lvlRank),
       lvlTypes(lvlTypes, lvlTypes + lvlRank),
-      lvl2dim(lvl2dim, lvl2dim + lvlRank) {
-  assert(dimSizes && "Got nullptr for dimension sizes");
-  assert(lvlSizes && "Got nullptr for level sizes");
-  assert(lvlTypes && "Got nullptr for level types");
-  assert(lvl2dim && "Got nullptr for level-to-dimension mapping");
+      dim2lvlVec(dim2lvl, dim2lvl + dimRank),
+      lvl2dimVec(lvl2dim, lvl2dim + lvlRank),
+      map(dimRank, lvlRank, dim2lvlVec.data(), lvl2dimVec.data()) {
+  assert(dimSizes && lvlSizes && lvlTypes && dim2lvl && lvl2dim);
   // Validate dim-indexed parameters.
   assert(dimRank > 0 && "Trivial shape is unsupported");
   for (uint64_t d = 0; d < dimRank; ++d)
diff --git a/mlir/lib/ExecutionEngine/SparseTensorRuntime.cpp b/mlir/lib/ExecutionEngine/SparseTensorRuntime.cpp
index 8340fe7dcf925be..bc6d4ad2c740189 100644
--- a/mlir/lib/ExecutionEngine/SparseTensorRuntime.cpp
+++ b/mlir/lib/ExecutionEngine/SparseTensorRuntime.cpp
@@ -179,39 +179,39 @@ extern "C" {
     switch (action) {                                                          \
     case Action::kEmpty:                                                       \
       return SparseTensorStorage<P, C, V>::newEmpty(                           \
-          dimRank, dimSizes, lvlRank, lvlSizes, lvlTypes, lvl2dim);            \
+          dimRank, dimSizes, lvlRank, lvlSizes, lvlTypes, dim2lvl, lvl2dim);   \
     case Action::kFromCOO: {                                                   \
       assert(ptr && "Received nullptr for SparseTensorCOO object");            \
       auto &coo = *static_cast<SparseTensorCOO<V> *>(ptr);                     \
       return SparseTensorStorage<P, C, V>::newFromCOO(                         \
-          dimRank, dimSizes, lvlRank, lvlTypes, lvl2dim, coo);                 \
+          dimRank, dimSizes, lvlRank, lvlTypes, dim2lvl, lvl2dim, coo);        \
     }                                                                          \
     case Action::kSparseToSparse: {                                            \
       assert(ptr && "Received nullptr for SparseTensorStorage object");        \
       auto &tensor = *static_cast<SparseTensorStorageBase *>(ptr);             \
       return SparseTensorStorage<P, C, V>::newFromSparseTensor(                \
-          dimRank, dimSizes, lvlRank, lvlSizes, lvlTypes, lvl2dim, dimRank,    \
-          dim2lvl, tensor);                                                    \
+          dimRank, dimSizes, lvlRank, lvlSizes, lvlTypes, dim2lvl, lvl2dim,    \
+          dimRank, tensor);                                                    \
     }                                                                          \
     case Action::kEmptyCOO:                                                    \
       return new SparseTensorCOO<V>(lvlRank, lvlSizes);                        \
     case Action::kToCOO: {                                                     \
       assert(ptr && "Received nullptr for SparseTensorStorage object");        \
       auto &tensor = *static_cast<SparseTensorStorage<P, C, V> *>(ptr);        \
-      return tensor.toCOO(lvlRank, lvlSizes, dimRank, dim2lvl);                \
+      return tensor.toCOO(lvlRank, lvlSizes, dimRank, dim2lvl, lvl2dim);       \
     }                                                                          \
     case Action::kToIterator: {                                                \
       assert(ptr && "Received nullptr for SparseTensorStorage object");        \
       auto &tensor = *static_cast<SparseTensorStorage<P, C, V> *>(ptr);        \
-      auto *coo = tensor.toCOO(lvlRank, lvlSizes, dimRank, dim2lvl);           \
+      auto *coo = tensor.toCOO(lvlRank, lvlSizes, dimRank, dim2lvl, lvl2dim);  \
       return new SparseTensorIterator<V>(coo);                                 \
     }                                                                          \
     case Action::kPack: {                                                      \
       assert(ptr && "Received nullptr for SparseTensorStorage object");        \
       intptr_t *buffers = static_cast<intptr_t *>(ptr);                        \
       return SparseTensorStorage<P, C, V>::packFromLvlBuffers(                 \
-          dimRank, dimSizes, lvlRank, lvlSizes, lvlTypes, lvl2dim, dimRank,    \
-          dim2lvl, buffers);                                                   \
+          dimRank, dimSizes, lvlRank, lvlSizes, lvlTypes, dim2lvl, lvl2dim,    \
+          dimRank, buffers);                                                   \
     }                                                                          \
     }                                                                          \
     MLIR_SPARSETENSOR_FATAL("unknown action: %d\n",                            \
@@ -250,9 +250,6 @@ void *_mlir_ciface_newSparseTensor( // NOLINT
   const index_type *dim2lvl = MEMREF_GET_PAYLOAD(dim2lvlRef);
   const index_type *lvl2dim = MEMREF_GET_PAYLOAD(lvl2dimRef);
 
-  // Prepare map.
-  // TODO: start using MapRef map(dimRank, lvlRank, dim2lvl, lvl2dim) below
-
   // Rewrite kIndex to kU64, to avoid introducing a bunch of new cases.
   // This is safe because of the static_assert above.
   if (posTp == OverheadType::kIndex)
@@ -403,7 +400,7 @@ MLIR_SPARSETENSOR_FOREVERY_O(IMPL_SPARSECOORDINATES)
 #undef IMPL_GETOVERHEAD
 
 // TODO: use MapRef here for translation of coordinates
-// TOOD: remove dim2lvl
+// TODO: remove dim2lvl
 #define IMPL_ADDELT(VNAME, V)                                                  \
   void *_mlir_ciface_addElt##VNAME(                                            \
       void *lvlCOO, StridedMemRefType<V, 0> *vref,                             \

>From 3ced0dd4447a7d5e165dc9d9bfbc6905881d9b34 Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles at arm.com>
Date: Thu, 14 Sep 2023 15:01:08 +0000
Subject: [PATCH 20/38] [flang] use TBAAForest in TBAABuilder

This is important to ensure that tags end up in the same trees that were
created in the FIR TBAA pass. If they are in different trees then
everything in one tree will be assumed to MayAlias with everything in the
other tree. This leads to poor performance.

@vzakhari requested that the old (not-per-function) trees be
maintained, so I left the old test intact.

PR: https://github.com/llvm/llvm-project/pull/68437
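The performance argument is that TBAA can only prove NoAlias between tags that
hang off the same root; tags under different roots are conservatively treated
as MayAlias. A minimal, self-contained model of that rule with per-function
roots (plain C++; the real implementation uses MLIR TBAA attributes and
different identifier strings):

  #include <cstdio>
  #include <map>
  #include <string>

  // Toy model: a tag is (root name, leaf type name). Two tags can only be
  // disambiguated when they share a root; otherwise we must say MayAlias.
  struct Tag { std::string root, type; };

  bool mayAlias(const Tag &a, const Tag &b) {
    if (a.root != b.root)
      return true;            // different trees: no information, MayAlias
    return a.type == b.type;  // same tree: distinct leaf types -> NoAlias
  }

  // Per-function roots, keyed by function name (the real root ids differ).
  std::map<std::string, std::string> functionRoots;

  Tag makeTag(const std::string &func, const std::string &type) {
    auto it = functionRoots.try_emplace(func, "root of " + func).first;
    return {it->second, type};
  }

  int main() {
    Tag a = makeTag("foo_", "array a");
    Tag b = makeTag("foo_", "array b");
    Tag c = makeTag("bar_", "array a");
    std::printf("a vs b: %d\n", mayAlias(a, b)); // 0: same tree, NoAlias
    std::printf("a vs c: %d\n", mayAlias(a, c)); // 1: different trees
  }

Hence the builder must look up the same per-function tree that the FIR TBAA
pass used, which is what routing everything through the shared forest
(trees[func]) achieves.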
---
 .../flang/Optimizer/CodeGen/TBAABuilder.h     |  49 +++-----
 flang/lib/Optimizer/CodeGen/TBAABuilder.cpp   |  61 +++++-----
 flang/test/Fir/tbaa-codegen.fir               |  10 +-
 flang/test/Fir/tbaa-codegen2.fir              | 114 ++++++++++++++++++
 flang/test/Fir/tbaa.fir                       |  24 ++--
 5 files changed, 182 insertions(+), 76 deletions(-)
 create mode 100644 flang/test/Fir/tbaa-codegen2.fir

diff --git a/flang/include/flang/Optimizer/CodeGen/TBAABuilder.h b/flang/include/flang/Optimizer/CodeGen/TBAABuilder.h
index 3aeb124f911867e..5420e48146bbfae 100644
--- a/flang/include/flang/Optimizer/CodeGen/TBAABuilder.h
+++ b/flang/include/flang/Optimizer/CodeGen/TBAABuilder.h
@@ -13,8 +13,8 @@
 #ifndef FORTRAN_OPTIMIZER_CODEGEN_TBAABUILDER_H
 #define FORTRAN_OPTIMIZER_CODEGEN_TBAABUILDER_H
 
+#include "flang/Optimizer/Analysis/TBAAForest.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
-#include "mlir/IR/BuiltinAttributes.h"
 
 namespace fir {
 
@@ -25,9 +25,9 @@ namespace fir {
 //
 // TBAA type information is represented with LLVM::MetadataOp operation
 // with specific symbol name `TBAABuilder::tbaaMetaOpName`. The basic
-// TBAA tree used for Flang consists of the following nodes:
+// TBAA trees used for Flang consist of the following nodes:
 //   llvm.metadata @__flang_tbaa {
-//     llvm.tbaa_root @root_0 {id = "Flang Type TBAA Root"}
+//     llvm.tbaa_root @root_0 {id = "Flang Type TBAA Function Root funcName"}
 //     llvm.tbaa_type_desc @type_desc_1 {id = "any access",
 //                                       members = {<@root_0, 0>}}
 //     llvm.tbaa_type_desc @type_desc_2 {id = "any data access",
@@ -162,9 +162,14 @@ namespace fir {
 // Given the storage association, all non-box accesses are represented
 // with the conservative data access tag:
 //   < `<any data access>`, `<any data access>`, 0 >
+
+// additional tags are added in flang/Optimizer/Transforms/AddAliasTags.cpp
+// (before CodeGen)
 class TBAABuilder {
 public:
-  TBAABuilder(mlir::MLIRContext *context, bool applyTBAA);
+  /// if forceUnifiedTree is true, functions will not have different TBAA trees
+  TBAABuilder(mlir::MLIRContext *context, bool applyTBAA,
+              bool forceUnifiedTree = false);
   TBAABuilder(TBAABuilder const &) = delete;
   TBAABuilder &operator=(TBAABuilder const &) = delete;
 
@@ -184,13 +189,13 @@ class TBAABuilder {
 
   // Returns TBAATagAttr representing access tag:
   //   < <descriptor member>, <descriptor member>, 0 >
-  mlir::LLVM::TBAATagAttr getAnyBoxAccessTag();
+  mlir::LLVM::TBAATagAttr getAnyBoxAccessTag(mlir::LLVM::LLVMFuncOp func);
   // Returns TBAATagAttr representing access tag:
   //   < <any data access>, <any data access>, 0 >
-  mlir::LLVM::TBAATagAttr getAnyDataAccessTag();
+  mlir::LLVM::TBAATagAttr getAnyDataAccessTag(mlir::LLVM::LLVMFuncOp func);
   // Returns TBAATagAttr representing access tag:
   //   < <any access>, <any access>, 0 >
-  mlir::LLVM::TBAATagAttr getAnyAccessTag();
+  mlir::LLVM::TBAATagAttr getAnyAccessTag(mlir::LLVM::LLVMFuncOp func);
 
   // Returns TBAATagAttr representing access tag described by the base and
   // access FIR types and the LLVM::GepOp representing the access in terms of
@@ -198,7 +203,8 @@ class TBAABuilder {
   // fir::BaseBoxType.
   mlir::LLVM::TBAATagAttr getBoxAccessTag(mlir::Type baseFIRType,
                                           mlir::Type accessFIRType,
-                                          mlir::LLVM::GEPOp gep);
+                                          mlir::LLVM::GEPOp gep,
+                                          mlir::LLVM::LLVMFuncOp func);
 
   // Returns TBAATagAttr representing access tag described by the base and
   // access FIR types and the LLVM::GepOp representing the access in terms of
@@ -206,34 +212,13 @@ class TBAABuilder {
   // "data" access, i.e. not an access of any box/descriptor member.
   mlir::LLVM::TBAATagAttr getDataAccessTag(mlir::Type baseFIRType,
                                            mlir::Type accessFIRType,
-                                           mlir::LLVM::GEPOp gep);
+                                           mlir::LLVM::GEPOp gep,
+                                           mlir::LLVM::LLVMFuncOp func);
 
   // Set to true, if TBAA builder is active, otherwise, all public
   // methods are no-ops.
   bool enableTBAA;
 
-  // LLVM::TBAARootAttr identifying Flang's TBAA root.
-  mlir::LLVM::TBAARootAttr flangTBAARoot;
-  // Identity string for Flang's TBAA root.
-  static constexpr llvm::StringRef flangTBAARootId = "Flang Type TBAA Root";
-
-  // LLVM::TBAATypeDescriptorAttr identifying "any access".
-  mlir::LLVM::TBAATypeDescriptorAttr anyAccessTypeDesc;
-  // Identity string for "any access" type descriptor.
-  static constexpr llvm::StringRef anyAccessTypeDescId = "any access";
-
-  // LLVM::TBAATypeDescriptorAttr identifying "any data access" (i.e. non-box
-  // memory access).
-  mlir::LLVM::TBAATypeDescriptorAttr anyDataAccessTypeDesc;
-  // Identity string for "any data access" type descriptor.
-  static constexpr llvm::StringRef anyDataAccessTypeDescId = "any data access";
-
-  // LLVM::TBAATypeDescriptorAttr identifying "descriptor member" access, i.e.
-  // any access within the bounds of a box/descriptor.
-  mlir::LLVM::TBAATypeDescriptorAttr boxMemberTypeDesc;
-  // Identity string for "descriptor member" type descriptor.
-  static constexpr llvm::StringRef boxMemberTypeDescId = "descriptor member";
-
   // Number of attached TBAA tags (used for debugging).
   unsigned tagAttachmentCounter = 0;
 
@@ -247,6 +232,8 @@ class TBAABuilder {
       std::tuple<mlir::LLVM::TBAANodeAttr, mlir::LLVM::TBAANodeAttr, int64_t>,
       mlir::LLVM::TBAATagAttr>
       tagsMap;
+
+  TBAAForrest trees;
 };
 
 } // namespace fir
diff --git a/flang/lib/Optimizer/CodeGen/TBAABuilder.cpp b/flang/lib/Optimizer/CodeGen/TBAABuilder.cpp
index 1a5ae8cf7aac629..8e7f59f76383c96 100644
--- a/flang/lib/Optimizer/CodeGen/TBAABuilder.cpp
+++ b/flang/lib/Optimizer/CodeGen/TBAABuilder.cpp
@@ -15,6 +15,9 @@
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
+#include <mlir/Dialect/LLVMIR/LLVMAttrs.h>
+#include <mlir/Dialect/LLVMIR/LLVMDialect.h>
+#include <mlir/Dialect/LLVMIR/LLVMTypes.h>
 
 #define DEBUG_TYPE "flang-tbaa-builder"
 
@@ -27,6 +30,13 @@ static llvm::cl::opt<bool> disableTBAA(
                    "to override default Flang behavior"),
     llvm::cl::init(false));
 
+// Disabling this will interact badly with the FIR TBAA pass, leading to worse
+// performance.
+static llvm::cl::opt<bool> perFunctionTBAATrees(
+    "per-function-tbaa-trees",
+    llvm::cl::desc("Give each function an independent TBAA tree (default)"),
+    llvm::cl::init(true), llvm::cl::Hidden);
+
 // tagAttachmentLimit is a debugging option that allows limiting
 // the number of TBAA access tag attributes attached to operations.
 // It is set to kTagAttachmentUnlimited by default denoting "no limit".
@@ -38,27 +48,12 @@ static llvm::cl::opt<unsigned>
 
 namespace fir {
 
-TBAABuilder::TBAABuilder(MLIRContext *context, bool applyTBAA)
-    : enableTBAA(applyTBAA && !disableTBAA) {
+TBAABuilder::TBAABuilder(MLIRContext *context, bool applyTBAA,
+                         bool forceUnifiedTree)
+    : enableTBAA(applyTBAA && !disableTBAA),
+      trees(/*separatePerFunction=*/perFunctionTBAATrees && !forceUnifiedTree) {
   if (!enableTBAA)
     return;
-
-  // Root node.
-  flangTBAARoot =
-      TBAARootAttr::get(context, StringAttr::get(context, flangTBAARootId));
-
-  // Any access node.
-  anyAccessTypeDesc = TBAATypeDescriptorAttr::get(
-      context, anyAccessTypeDescId, TBAAMemberAttr::get(flangTBAARoot, 0));
-
-  // Any data access node.
-  anyDataAccessTypeDesc =
-      TBAATypeDescriptorAttr::get(context, anyDataAccessTypeDescId,
-                                  TBAAMemberAttr::get(anyAccessTypeDesc, 0));
-
-  // Box member access node.
-  boxMemberTypeDesc = TBAATypeDescriptorAttr::get(
-      context, boxMemberTypeDescId, TBAAMemberAttr::get(anyAccessTypeDesc, 0));
 }
 
 TBAATagAttr TBAABuilder::getAccessTag(TBAATypeDescriptorAttr baseTypeDesc,
@@ -73,26 +68,31 @@ TBAATagAttr TBAABuilder::getAccessTag(TBAATypeDescriptorAttr baseTypeDesc,
   return tag;
 }
 
-TBAATagAttr TBAABuilder::getAnyBoxAccessTag() {
+TBAATagAttr TBAABuilder::getAnyBoxAccessTag(mlir::LLVM::LLVMFuncOp func) {
+  TBAATypeDescriptorAttr boxMemberTypeDesc = trees[func].boxMemberTypeDesc;
   return getAccessTag(boxMemberTypeDesc, boxMemberTypeDesc, /*offset=*/0);
 }
 
 TBAATagAttr TBAABuilder::getBoxAccessTag(Type baseFIRType, Type accessFIRType,
-                                         GEPOp gep) {
-  return getAnyBoxAccessTag();
+                                         GEPOp gep,
+                                         mlir::LLVM::LLVMFuncOp func) {
+  return getAnyBoxAccessTag(func);
 }
 
-TBAATagAttr TBAABuilder::getAnyDataAccessTag() {
+TBAATagAttr TBAABuilder::getAnyDataAccessTag(mlir::LLVM::LLVMFuncOp func) {
+  TBAATypeDescriptorAttr anyDataAccessTypeDesc = trees[func].anyDataTypeDesc;
   return getAccessTag(anyDataAccessTypeDesc, anyDataAccessTypeDesc,
                       /*offset=*/0);
 }
 
 TBAATagAttr TBAABuilder::getDataAccessTag(Type baseFIRType, Type accessFIRType,
-                                          GEPOp gep) {
-  return getAnyDataAccessTag();
+                                          GEPOp gep,
+                                          mlir::LLVM::LLVMFuncOp func) {
+  return getAnyDataAccessTag(func);
 }
 
-TBAATagAttr TBAABuilder::getAnyAccessTag() {
+TBAATagAttr TBAABuilder::getAnyAccessTag(mlir::LLVM::LLVMFuncOp func) {
+  TBAATypeDescriptorAttr anyAccessTypeDesc = trees[func].anyAccessDesc;
   return getAccessTag(anyAccessTypeDesc, anyAccessTypeDesc, /*offset=*/0);
 }
 
@@ -101,6 +101,9 @@ void TBAABuilder::attachTBAATag(AliasAnalysisOpInterface op, Type baseFIRType,
   if (!enableTBAA)
     return;
 
+  mlir::LLVM::LLVMFuncOp func = op->getParentOfType<mlir::LLVM::LLVMFuncOp>();
+  assert(func && "func.func should have already been converted to llvm.func");
+
   ++tagAttachmentCounter;
   if (tagAttachmentLimit != kTagAttachmentUnlimited &&
       tagAttachmentCounter > tagAttachmentLimit)
@@ -115,11 +118,11 @@ void TBAABuilder::attachTBAATag(AliasAnalysisOpInterface op, Type baseFIRType,
     // a mix of data members and descriptor members may alias
     // with both data and descriptor accesses.
     // Conservatively set any-access tag if there is any descriptor member.
-    tbaaTagSym = getAnyAccessTag();
+    tbaaTagSym = getAnyAccessTag(func);
   } else if (baseFIRType.isa<fir::BaseBoxType>()) {
-    tbaaTagSym = getBoxAccessTag(baseFIRType, accessFIRType, gep);
+    tbaaTagSym = getBoxAccessTag(baseFIRType, accessFIRType, gep, func);
   } else {
-    tbaaTagSym = getDataAccessTag(baseFIRType, accessFIRType, gep);
+    tbaaTagSym = getDataAccessTag(baseFIRType, accessFIRType, gep, func);
   }
 
   if (!tbaaTagSym)
diff --git a/flang/test/Fir/tbaa-codegen.fir b/flang/test/Fir/tbaa-codegen.fir
index 386fe42eaaba9a2..7acd04da24c9cbf 100644
--- a/flang/test/Fir/tbaa-codegen.fir
+++ b/flang/test/Fir/tbaa-codegen.fir
@@ -11,7 +11,7 @@
 #tbaa_type_desc = #llvm.tbaa_type_desc<id = "any access", members = {<#tbaa_root, 0>}>
 #tbaa_type_desc1 = #llvm.tbaa_type_desc<id = "any data access", members = {<#tbaa_type_desc, 0>}>
 #tbaa_type_desc2 = #llvm.tbaa_type_desc<id = "dummy arg data", members = {<#tbaa_type_desc1, 0>}>
-#tbaa_type_desc3 = #llvm.tbaa_type_desc<id = "dummy arg data/a", members = {<#tbaa_type_desc2, 0>}>
+#tbaa_type_desc3 = #llvm.tbaa_type_desc<id = "dummy arg data/_QFfuncEa", members = {<#tbaa_type_desc2, 0>}>
 #tbaa_tag = #llvm.tbaa_tag<base_type = #tbaa_type_desc3, access_type = #tbaa_type_desc3, offset = 0>
 module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.target_triple = "aarch64-unknown-linux-gnu"} {
   func.func @_QPsimple(%arg0: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "a"}) {
@@ -39,9 +39,9 @@ module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.targ
 // CHECK:  store i32 %[[A2]], ptr %[[A1]], align 4, !tbaa ![[A_ACCESS_TAG]]
 // CHECK:  ret void
 // CHECK: }
+// CHECK: ![[ANY_ACCESS_TYPE:.*]] = !{!"any access", ![[ROOT:.*]], i64 0}
+// CHECK: ![[ROOT]] = !{!"Flang function root _QPsimple"}
 // CHECK: ![[A_ACCESS_TAG]] = !{![[A_ACCESS_TYPE:.*]], ![[A_ACCESS_TYPE]], i64 0}
-// CHECK: ![[A_ACCESS_TYPE]] = !{!"dummy arg data/a", ![[DUMMY_ARG_TYPE:.*]], i64 0}
+// CHECK: ![[A_ACCESS_TYPE]] = !{!"dummy arg data/_QFfuncEa", ![[DUMMY_ARG_TYPE:.*]], i64 0}
 // CHECK: ![[DUMMY_ARG_TYPE]] = !{!"dummy arg data", ![[DATA_ACCESS_TYPE:.*]], i64 0}
-// CHECK: ![[DATA_ACCESS_TYPE]] = !{!"any data access", ![[ANY_ACCESS_TYPE:.*]], i64 0}
-// CHECK: ![[ANY_ACCESS_TYPE]] = !{!"any access", ![[ROOT:.*]], i64 0}
-// CHECK: ![[ROOT]] = !{!"Flang function root _QPsimple"}
\ No newline at end of file
+// CHECK: ![[DATA_ACCESS_TYPE]] = !{!"any data access", ![[ANY_ACCESS_TYPE]], i64 0}
diff --git a/flang/test/Fir/tbaa-codegen2.fir b/flang/test/Fir/tbaa-codegen2.fir
new file mode 100644
index 000000000000000..f79a6108fb41e80
--- /dev/null
+++ b/flang/test/Fir/tbaa-codegen2.fir
@@ -0,0 +1,114 @@
+// Test that TBAA attributes can be added to fir.load and fir.store,
+// that these attributes are propagated to LLVM IR, and that they
+// interoperate with TBAA tags added during codegen.
+
+// RUN: tco %s | FileCheck %s
+
+// subroutine func(a)
+//   integer, intent(inout) :: a(:)
+//   a = a+1
+//   a(1) = a(2)
+#tbaa_root = #llvm.tbaa_root<id = "Flang function root _QPfunc">
+#tbaa_type_desc = #llvm.tbaa_type_desc<id = "any access", members = {<#tbaa_root, 0>}>
+#tbaa_type_desc1 = #llvm.tbaa_type_desc<id = "any data access", members = {<#tbaa_type_desc, 0>}>
+#tbaa_type_desc2 = #llvm.tbaa_type_desc<id = "dummy arg data", members = {<#tbaa_type_desc1, 0>}>
+#tbaa_type_desc3 = #llvm.tbaa_type_desc<id = "dummy arg data/_QFfuncEa", members = {<#tbaa_type_desc2, 0>}>
+#tbaa_tag = #llvm.tbaa_tag<base_type = #tbaa_type_desc3, access_type = #tbaa_type_desc3, offset = 0>
+module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.target_triple = "aarch64-unknown-linux-gnu"} {
+  func.func @_QPfunc(%arg0: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "a"}) {
+    %c3_i32 = arith.constant 3 : i32
+    %c1_i32 = arith.constant 1 : i32
+    %c0 = arith.constant 0 : index
+    %c2 = arith.constant 2 : index
+    %c1 = arith.constant 1 : index
+    %0 = fir.alloca !fir.box<!fir.array<?xi32>>
+    %1 = fir.declare %arg0 {fortran_attrs = #fir.var_attrs<intent_inout>, uniq_name = "_QFfuncEa"} : (!fir.box<!fir.array<?xi32>>) -> !fir.box<!fir.array<?xi32>>
+    %2 = fir.rebox %1 : (!fir.box<!fir.array<?xi32>>) -> !fir.box<!fir.array<?xi32>>
+    %3:3 = fir.box_dims %2, %c0 : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+    %4 = fir.shape %3#1 : (index) -> !fir.shape<1>
+    %5 = fir.allocmem !fir.array<?xi32>, %3#1 {bindc_name = ".tmp.array", uniq_name = ""}
+    %6 = fir.declare %5(%4) {uniq_name = ".tmp.array"} : (!fir.heap<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.heap<!fir.array<?xi32>>
+    %7 = fir.embox %6(%4) : (!fir.heap<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
+    fir.do_loop %arg1 = %c1 to %3#1 step %c1 unordered {
+      %16 = fir.array_coor %2 %arg1 : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+      // load with tbaa
+      %17 = fir.load %16 {tbaa = [#tbaa_tag]} : !fir.ref<i32>
+      %18 = arith.addi %17, %c1_i32 : i32
+      %19 = fir.array_coor %6(%4) %arg1 : (!fir.heap<!fir.array<?xi32>>, !fir.shape<1>, index) -> !fir.ref<i32>
+      // store without tbaa
+      fir.store %18 to %19 : !fir.ref<i32>
+    }
+    fir.store %2 to %0 : !fir.ref<!fir.box<!fir.array<?xi32>>>
+    %8 = fir.address_of(@_QQcl.2F746D702F73696D706C652E66393000) : !fir.ref<!fir.char<1,16>>
+    %9 = fir.convert %0 : (!fir.ref<!fir.box<!fir.array<?xi32>>>) -> !fir.ref<!fir.box<none>>
+    %10 = fir.convert %7 : (!fir.box<!fir.array<?xi32>>) -> !fir.box<none>
+    %11 = fir.convert %8 : (!fir.ref<!fir.char<1,16>>) -> !fir.ref<i8>
+    %12 = fir.call @_FortranAAssign(%9, %10, %11, %c3_i32) : (!fir.ref<!fir.box<none>>, !fir.box<none>, !fir.ref<i8>, i32) -> none
+    fir.freemem %6 : !fir.heap<!fir.array<?xi32>>
+    %13 = fir.array_coor %2 %c2 : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+    // load modified not to have tbaa
+    %14 = fir.load %13 : !fir.ref<i32>
+    %15 = fir.array_coor %2 %c1 : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+    // store with tbaa
+    fir.store %14 to %15 {tbaa = [#tbaa_tag]} : !fir.ref<i32>
+    return
+  }
+  func.func private @_FortranAAssign(!fir.ref<!fir.box<none>>, !fir.box<none>, !fir.ref<i8>, i32) -> none attributes {fir.runtime}
+  fir.global linkonce @_QQcl.2F746D702F73696D706C652E66393000 constant : !fir.char<1,16> {
+    %0 = fir.string_lit "/tmp/simple.f90\00"(16) : !fir.char<1,16>
+    fir.has_value %0 : !fir.char<1,16>
+  }
+}
+// CHECK-LABEL: define void @_QPfunc(
+// CHECK-SAME:      ptr %[[ARG0:.*]]) {
+// [...]
+// CHECK:  %[[VAL5:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[ARG0]], i32 0, i32 7, i32 0, i32 0
+// box access:
+// CHECK:  %[[VAL6:.*]] = load i64, ptr %[[VAL5]], align 4, !tbaa ![[BOX_ACCESS_TAG:.*]]
+// CHECK:  %[[VAL7:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %0, i32 0, i32 7, i32 0, i32 1
+// box access:
+// CHECK:  %[[VAL8:.*]] = load i64, ptr %[[VAL7]], align 4, !tbaa ![[BOX_ACCESS_TAG]]
+// CHECK:  %[[VAL9:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[ARG0]], i32 0, i32 7, i32 0, i32 2
+// box access:
+// CHECK:  %[[VAL10:.*]] = load i64, ptr %[[VAL9]], align 4, !tbaa ![[BOX_ACCESS_TAG]]
+// CHECK:  %[[VAL11:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[ARG0]], i32 0, i32 0
+// box access:
+// CHECK:  %[[VAL12:.*]] = load ptr, ptr %[[VAL11]], align 8, !tbaa ![[BOX_ACCESS_TAG]]
+// CHECK:  %[[VAL15:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %14, ptr %[[VAL12]], 0
+// CHECK:  store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[VAL15]], ptr %{{.*}}, align 8, !tbaa ![[BOX_ACCESS_TAG]]
+// CHECK:  %[[VAL16:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %{{.*}}, i32 0, i32 7, i64 0, i32 0
+// box access:
+// CHECK:  %[[VAL17:.*]] = load i64, ptr %[[VAL16]], align 4, !tbaa ![[BOX_ACCESS_TAG]]
+// CHECK:  %[[VAL18:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %{{.*}}, i32 0, i32 7, i64 0, i32 1
+// box access:
+// CHECK:  %[[VAL19:.*]] = load i64, ptr %[[VAL18]], align 4, !tbaa ![[BOX_ACCESS_TAG]]
+// CHECK:  %[[VAL20:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %{{.*}}, i32 0, i32 7, i64 0, i32 2
+// box access:
+// CHECK:  %[[VAL21:.*]] = load i64, ptr %[[VAL20]], align 4, !tbaa ![[BOX_ACCESS_TAG]]
+// [...]
+// box access:
+// CHECK:  store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %{{.*}}, ptr %{{.*}}, align 8, !tbaa ![[BOX_ACCESS_TAG]]
+// [...]
+
+// [...]
+// CHECK:  %[[VAL40:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %{{.*}}, i32 0, i32 0
+// box access:
+// CHECK:  %[[VAL41:.*]] = load ptr, ptr %[[VAL40]], align 8, !tbaa ![[BOX_ACCESS_TAG]]
+// CHECK:  %[[VAL42:.*]] = getelementptr i8, ptr %[[VAL41]], i64 %{{.*}}
+// access to 'a':
+// CHECK:  %[[VAL43:.*]] = load i32, ptr %[[VAL42]], align 4, !tbaa ![[A_ACCESS_TAG:.*]]
+// [...]
+// CHECK:  %[[VAL50:.*]] = getelementptr i32, ptr %{{.*}}, i64 %{{.*}}
+// store to the temporary:
+// CHECK:  store i32 %{{.*}}, ptr %[[VAL50]], align 4, !tbaa ![[DATA_ACCESS_TAG:.*]]
+// [...]
+
+// CHECK: [[BOX_ACCESS_TAG]] = !{![[BOX_ACCESS_TYPE:.*]], ![[BOX_ACCESS_TYPE]], i64 0}
+// CHECK: ![[BOX_ACCESS_TYPE]] = !{!"descriptor member", ![[ANY_ACCESS_TYPE:.*]], i64 0}
+// CHECK: ![[ANY_ACCESS_TYPE]] = !{!"any access", ![[ROOT_TYPE:.*]], i64 0}
+// CHECK: ![[ROOT_TYPE]] = !{!"Flang function root _QPfunc"}
+// CHECK: ![[A_ACCESS_TAG]] = !{![[A_ACCESS_TYPE:.*]], ![[A_ACCESS_TYPE]], i64 0}
+// CHECK: ![[A_ACCESS_TYPE]] = !{!"dummy arg data/_QFfuncEa", ![[ARG_ACCESS_TYPE:.*]], i64 0}
+// CHECK: ![[ARG_ACCESS_TYPE]] = !{!"dummy arg data", ![[DATA_ACCESS_TYPE:.*]], i64 0}
+// CHECK: ![[DATA_ACCESS_TYPE]] = !{!"any data access", ![[ANY_ACCESS_TYPE]], i64 0}
+// CHECK: ![[DATA_ACCESS_TAG]] = !{![[DATA_ACCESS_TYPE]], ![[DATA_ACCESS_TYPE]], i64 0}
diff --git a/flang/test/Fir/tbaa.fir b/flang/test/Fir/tbaa.fir
index eabc9f30127fa3d..d260e4f4aec46a3 100644
--- a/flang/test/Fir/tbaa.fir
+++ b/flang/test/Fir/tbaa.fir
@@ -1,5 +1,7 @@
-// RUN: fir-opt %s --split-input-file --fir-to-llvm-ir="target=x86_64-unknown-linux-gnu apply-tbaa=true" | FileCheck %s
-// RUN: fir-opt %s --split-input-file --fir-to-llvm-ir="target=aarch64-unknown-linux-gnu apply-tbaa=true" | FileCheck %s
+// Test without per-function TBAA trees so that this functionality does not bitrot.
+// Per-function TBAA trees are tested in tbaa-codegen2.fir.
+// RUN: fir-opt %s --split-input-file --fir-to-llvm-ir="target=x86_64-unknown-linux-gnu apply-tbaa=true" --per-function-tbaa-trees=false | FileCheck %s
+// RUN: fir-opt %s --split-input-file --fir-to-llvm-ir="target=aarch64-unknown-linux-gnu apply-tbaa=true" --per-function-tbaa-trees=false | FileCheck %s
 
 module {
   func.func @tbaa(%arg0: !fir.class<!fir.array<?xnone>> {fir.bindc_name = "a"}) {
@@ -20,7 +22,7 @@ module {
   }
 }
 
-// CHECK-DAG:     #[[ROOT:.*]] = #llvm.tbaa_root<id = "Flang Type TBAA Root">
+// CHECK-DAG:     #[[ROOT:.*]] = #llvm.tbaa_root<id = "Flang function root ">
 // CHECK-DAG:     #[[ANYACC:.*]] = #llvm.tbaa_type_desc<id = "any access", members = {<#[[ROOT]], 0>}>
 // CHECK-DAG:     #[[ANYDACC:.*]] = #llvm.tbaa_type_desc<id = "any data access", members = {<#[[ANYACC]], 0>}>
 // CHECK-DAG:     #[[BOXMEM:.*]] = #llvm.tbaa_type_desc<id = "descriptor member", members = {<#[[ANYACC]], 0>}>
@@ -119,7 +121,7 @@ module {
   }
 }
 
-// CHECK-DAG:     #[[ROOT:.*]] = #llvm.tbaa_root<id = "Flang Type TBAA Root">
+// CHECK-DAG:     #[[ROOT:.*]] = #llvm.tbaa_root<id = "Flang function root ">
 // CHECK-DAG:     #[[ANYACC:.*]] = #llvm.tbaa_type_desc<id = "any access", members = {<#[[ROOT]], 0>}>
 // CHECK-DAG:     #[[BOXMEM:.*]] = #llvm.tbaa_type_desc<id = "descriptor member", members = {<#[[ANYACC]], 0>}>
 // CHECK-DAG:     #[[$BOXT:.*]] = #llvm.tbaa_tag<base_type = #[[BOXMEM]], access_type = #[[BOXMEM]], offset = 0>
@@ -245,7 +247,7 @@ func.func @tbaa(%arg0: !fir.box<!fir.array<*:f64>>) -> i32 {
   return %0 : i32
 }
 
-// CHECK-DAG:     #[[ROOT:.*]] = #llvm.tbaa_root<id = "Flang Type TBAA Root">
+// CHECK-DAG:     #[[ROOT:.*]] = #llvm.tbaa_root<id = "Flang function root ">
 // CHECK-DAG:     #[[ANYACC:.*]] = #llvm.tbaa_type_desc<id = "any access", members = {<#[[ROOT]], 0>}>
 // CHECK-DAG:     #[[BOXMEM:.*]] = #llvm.tbaa_type_desc<id = "descriptor member", members = {<#[[ANYACC]], 0>}>
 // CHECK-DAG:     #[[$BOXT:.*]] = #llvm.tbaa_tag<base_type = #[[BOXMEM]], access_type = #[[BOXMEM]], offset = 0>
@@ -264,7 +266,7 @@ func.func @tbaa(%arg0: !fir.box<!fir.array<*:f64>>) -> i1 {
   return %0 : i1
 }
 
-// CHECK-DAG:     #[[ROOT:.*]] = #llvm.tbaa_root<id = "Flang Type TBAA Root">
+// CHECK-DAG:     #[[ROOT:.*]] = #llvm.tbaa_root<id = "Flang function root ">
 // CHECK-DAG:     #[[ANYACC:.*]] = #llvm.tbaa_type_desc<id = "any access", members = {<#[[ROOT]], 0>}>
 // CHECK-DAG:     #[[BOXMEM:.*]] = #llvm.tbaa_type_desc<id = "descriptor member", members = {<#[[ANYACC]], 0>}>
 // CHECK-DAG:     #[[$BOXT:.*]] = #llvm.tbaa_tag<base_type = #[[BOXMEM]], access_type = #[[BOXMEM]], offset = 0>
@@ -285,7 +287,7 @@ func.func @tbaa(%arg0: !fir.box<f32>) -> i32 {
   return %0 : i32
 }
 
-// CHECK-DAG:     #[[ROOT:.*]] = #llvm.tbaa_root<id = "Flang Type TBAA Root">
+// CHECK-DAG:     #[[ROOT:.*]] = #llvm.tbaa_root<id = "Flang function root ">
 // CHECK-DAG:     #[[ANYACC:.*]] = #llvm.tbaa_type_desc<id = "any access", members = {<#[[ROOT]], 0>}>
 // CHECK-DAG:     #[[BOXMEM:.*]] = #llvm.tbaa_type_desc<id = "descriptor member", members = {<#[[ANYACC]], 0>}>
 // CHECK-DAG:     #[[$BOXT:.*]] = #llvm.tbaa_tag<base_type = #[[BOXMEM]], access_type = #[[BOXMEM]], offset = 0>
@@ -304,7 +306,7 @@ func.func @tbaa(%arg0: !fir.box<!fir.array<*:f64>>) -> i1 {
   return %0 : i1
 }
 
-// CHECK-DAG:     #[[ROOT:.*]] = #llvm.tbaa_root<id = "Flang Type TBAA Root">
+// CHECK-DAG:     #[[ROOT:.*]] = #llvm.tbaa_root<id = "Flang function root ">
 // CHECK-DAG:     #[[ANYACC:.*]] = #llvm.tbaa_type_desc<id = "any access", members = {<#[[ROOT]], 0>}>
 // CHECK-DAG:     #[[BOXMEM:.*]] = #llvm.tbaa_type_desc<id = "descriptor member", members = {<#[[ANYACC]], 0>}>
 // CHECK-DAG:     #[[$BOXT:.*]] = #llvm.tbaa_tag<base_type = #[[BOXMEM]], access_type = #[[BOXMEM]], offset = 0>
@@ -328,7 +330,7 @@ func.func @tbaa(%arg0: !fir.box<!fir.array<?xi32>>) {
   return
 }
 
-// CHECK-DAG:     #[[ROOT:.*]] = #llvm.tbaa_root<id = "Flang Type TBAA Root">
+// CHECK-DAG:     #[[ROOT:.*]] = #llvm.tbaa_root<id = "Flang function root ">
 // CHECK-DAG:     #[[ANYACC:.*]] = #llvm.tbaa_type_desc<id = "any access", members = {<#[[ROOT]], 0>}>
 // CHECK-DAG:     #[[BOXMEM:.*]] = #llvm.tbaa_type_desc<id = "descriptor member", members = {<#[[ANYACC]], 0>}>
 // CHECK-DAG:     #[[$BOXT:.*]] = #llvm.tbaa_tag<base_type = #[[BOXMEM]], access_type = #[[BOXMEM]], offset = 0>
@@ -356,7 +358,7 @@ func.func @tbaa(%arg0: !fir.box<!fir.array<?xi32>>) {
 
 // Check that the scalar aggregate load/store with a descriptor member
 // is mapped to any-access.
-// CHECK-DAG:     #[[ROOT:.*]] = #llvm.tbaa_root<id = "Flang Type TBAA Root">
+// CHECK-DAG:     #[[ROOT:.*]] = #llvm.tbaa_root<id = "Flang function root ">
 // CHECK-DAG:     #[[ANYACC:.*]] = #llvm.tbaa_type_desc<id = "any access", members = {<#[[ROOT]], 0>}>
 // CHECK-DAG:     #[[$ANYT:.*]] = #llvm.tbaa_tag<base_type = #[[ANYACC]], access_type = #[[ANYACC]], offset = 0>
 
@@ -373,7 +375,7 @@ func.func @tbaa(%arg0: !fir.ref<!fir.type<_QMtypesTt{x:!fir.box<!fir.heap<f32>>}
 
 // Check that the array aggregate load/store with a descriptor member
 // is mapped to any-access.
-// CHECK-DAG:     #[[ROOT:.*]] = #llvm.tbaa_root<id = "Flang Type TBAA Root">
+// CHECK-DAG:     #[[ROOT:.*]] = #llvm.tbaa_root<id = "Flang function root ">
 // CHECK-DAG:     #[[ANYACC:.*]] = #llvm.tbaa_type_desc<id = "any access", members = {<#[[ROOT]], 0>}>
 // CHECK-DAG:     #[[$ANYT:.*]] = #llvm.tbaa_tag<base_type = #[[ANYACC]], access_type = #[[ANYACC]], offset = 0>
 

>From e6089132cc7a5498019e0b7140de3241877b9695 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i at maskray.me>
Date: Wed, 11 Oct 2023 09:23:56 -0700
Subject: [PATCH 21/38] [asan] Ensure __asan_register_elf_globals is called in
 COMDAT asan.module_ctor (#67745)

On ELF platforms, when there is no global variable and the unique module ID is
non-empty, a COMDAT asan.module_ctor is created with no
`__asan_register_elf_globals` call. If this COMDAT is the prevailing copy
selected by the linker, the linkage unit will have no
`__asan_register_elf_globals` call: the redzone will not be poisoned and the
ODR violation checker will not work (#67677).

This behavior is benign for -fno-sanitize-address-globals-dead-stripping because
asan.module_ctor functions that call `__asan_register_globals`
(`InstrumentGlobalsWithMetadataArray`) do not use COMDAT.

To fix #67677:

* Use COMDAT for -fsanitize-address-globals-dead-stripping on ELF platforms.
* Call `__asan_register_elf_globals` even if there is no global variable.
* If the unique module ID is empty, don't call SetComdatForGlobalMetadata:
  placing `@.str` in a COMDAT would incorrectly discard internal COMDAT `@.str`
  in other compile units.

An alternative would be to keep asan.module_ctor non-COMDAT and skip the
`__asan_register_elf_globals` call when there is no global variable. However,
that asan.module_ctor function could then not be eliminated by the linker.

Tested with the following script; only ELF -fsanitize-address-globals-dead-stripping changes behavior.
```
echo > a.cc  # no global variable, empty uniqueModuleId
echo 'void f() {}' > b.cc  # with global variable, with uniqueModuleId
echo 'int g;' > c.cc  # with global variable
for t in x86_64-linux-gnu arm64-apple-macosx x86_64-windows-msvc; do
  for gc in -f{,no-}sanitize-address-globals-dead-stripping; do
    for f in a.cc b.cc c.cc; do
      echo /tmp/Rel/bin/clang -S --target=$t -fsanitize=address $gc $f -o -
      /tmp/Rel/bin/clang -S --target=$t -fsanitize=address $gc $f -o - | sed -n '/asan.module_ctor/,/ret/p'
    done
  done
done
```
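
For reference, here is a minimal C++ sketch of the constructor-comdat decision
described above. The helper name and parameters are hypothetical stand-ins, not
the actual ModuleAddressSanitizer members; the authoritative dispatch is in
instrumentGlobals() in the diff below.

```cpp
#include <cstddef>

// Hedged sketch: when does asan.module_ctor get a COMDAT after this change?
// IsELF, UseGlobalsGC and NumInstrumentedGlobals stand in for the real
// TargetTriple / globals-GC / "n" values; this is not the in-tree code.
bool useCtorComdat(bool IsELF, bool UseGlobalsGC,
                   std::size_t NumInstrumentedGlobals) {
  if (UseGlobalsGC && IsELF)
    return true; // ELF + GC: use COMDAT and register ELF globals even when
                 // NumInstrumentedGlobals == 0.
  if (NumInstrumentedGlobals == 0)
    return IsELF; // No globals: every TU emits an identical ctor/dtor, so an
                  // ELF COMDAT is still safe; other formats stay non-COMDAT.
  return false; // COFF / Mach-O / metadata-array paths keep a non-COMDAT ctor.
}
```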

---

Identical to commit 16eed8c906875e748c3cb610f3dc4b875f3882aa.
6420d3301cd4f0793adcf11f59e8398db73737d8 was an incorrect revert, made for
genuine but purely internal issues.
---
 .../Instrumentation/AddressSanitizer.cpp      | 61 ++++++++++---------
 .../Instrumentation/AddressSanitizer/basic.ll |  4 +-
 .../AddressSanitizer/global_metadata_array.ll | 19 ++++--
 .../AddressSanitizer/global_with_comdat.ll    | 50 ++++++++++++++-
 4 files changed, 97 insertions(+), 37 deletions(-)

diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 9e9b192d442ce5e..e80ee1953de6b21 100644
--- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -819,11 +819,11 @@ class ModuleAddressSanitizer {
 private:
   void initializeCallbacks(Module &M);
 
-  bool InstrumentGlobals(IRBuilder<> &IRB, Module &M, bool *CtorComdat);
+  void instrumentGlobals(IRBuilder<> &IRB, Module &M, bool *CtorComdat);
   void InstrumentGlobalsCOFF(IRBuilder<> &IRB, Module &M,
                              ArrayRef<GlobalVariable *> ExtendedGlobals,
                              ArrayRef<Constant *> MetadataInitializers);
-  void InstrumentGlobalsELF(IRBuilder<> &IRB, Module &M,
+  void instrumentGlobalsELF(IRBuilder<> &IRB, Module &M,
                             ArrayRef<GlobalVariable *> ExtendedGlobals,
                             ArrayRef<Constant *> MetadataInitializers,
                             const std::string &UniqueModuleId);
@@ -2177,7 +2177,7 @@ void ModuleAddressSanitizer::InstrumentGlobalsCOFF(
     appendToCompilerUsed(M, MetadataGlobals);
 }
 
-void ModuleAddressSanitizer::InstrumentGlobalsELF(
+void ModuleAddressSanitizer::instrumentGlobalsELF(
     IRBuilder<> &IRB, Module &M, ArrayRef<GlobalVariable *> ExtendedGlobals,
     ArrayRef<Constant *> MetadataInitializers,
     const std::string &UniqueModuleId) {
@@ -2187,7 +2187,7 @@ void ModuleAddressSanitizer::InstrumentGlobalsELF(
   // false negative odr violations at link time. If odr indicators are used, we
   // keep the comdat sections, as link time odr violations will be dectected on
   // the odr indicator symbols.
-  bool UseComdatForGlobalsGC = UseOdrIndicator;
+  bool UseComdatForGlobalsGC = UseOdrIndicator && !UniqueModuleId.empty();
 
   SmallVector<GlobalValue *, 16> MetadataGlobals(ExtendedGlobals.size());
   for (size_t i = 0; i < ExtendedGlobals.size(); i++) {
@@ -2237,7 +2237,7 @@ void ModuleAddressSanitizer::InstrumentGlobalsELF(
 
   // We also need to unregister globals at the end, e.g., when a shared library
   // gets closed.
-  if (DestructorKind != AsanDtorKind::None) {
+  if (DestructorKind != AsanDtorKind::None && !MetadataGlobals.empty()) {
     IRBuilder<> IrbDtor(CreateAsanModuleDtor(M));
     IrbDtor.CreateCall(AsanUnregisterElfGlobals,
                        {IRB.CreatePointerCast(RegisteredFlag, IntptrTy),
@@ -2343,10 +2343,8 @@ void ModuleAddressSanitizer::InstrumentGlobalsWithMetadataArray(
 // redzones and inserts this function into llvm.global_ctors.
 // Sets *CtorComdat to true if the global registration code emitted into the
 // asan constructor is comdat-compatible.
-bool ModuleAddressSanitizer::InstrumentGlobals(IRBuilder<> &IRB, Module &M,
+void ModuleAddressSanitizer::instrumentGlobals(IRBuilder<> &IRB, Module &M,
                                                bool *CtorComdat) {
-  *CtorComdat = false;
-
   // Build set of globals that are aliased by some GA, where
   // getExcludedAliasedGlobal(GA) returns the relevant GlobalVariable.
   SmallPtrSet<const GlobalVariable *, 16> AliasedGlobalExclusions;
@@ -2364,11 +2362,6 @@ bool ModuleAddressSanitizer::InstrumentGlobals(IRBuilder<> &IRB, Module &M,
   }
 
   size_t n = GlobalsToChange.size();
-  if (n == 0) {
-    *CtorComdat = true;
-    return false;
-  }
-
   auto &DL = M.getDataLayout();
 
   // A global is described by a structure
@@ -2391,8 +2384,11 @@ bool ModuleAddressSanitizer::InstrumentGlobals(IRBuilder<> &IRB, Module &M,
 
   // We shouldn't merge same module names, as this string serves as unique
   // module ID in runtime.
-  GlobalVariable *ModuleName = createPrivateGlobalForString(
-      M, M.getModuleIdentifier(), /*AllowMerging*/ false, kAsanGenPrefix);
+  GlobalVariable *ModuleName =
+      n != 0
+          ? createPrivateGlobalForString(M, M.getModuleIdentifier(),
+                                         /*AllowMerging*/ false, kAsanGenPrefix)
+          : nullptr;
 
   for (size_t i = 0; i < n; i++) {
     GlobalVariable *G = GlobalsToChange[i];
@@ -2517,19 +2513,27 @@ bool ModuleAddressSanitizer::InstrumentGlobals(IRBuilder<> &IRB, Module &M,
   }
   appendToCompilerUsed(M, ArrayRef<GlobalValue *>(GlobalsToAddToUsedList));
 
-  std::string ELFUniqueModuleId =
-      (UseGlobalsGC && TargetTriple.isOSBinFormatELF()) ? getUniqueModuleId(&M)
-                                                        : "";
-
-  if (!ELFUniqueModuleId.empty()) {
-    InstrumentGlobalsELF(IRB, M, NewGlobals, Initializers, ELFUniqueModuleId);
+  if (UseGlobalsGC && TargetTriple.isOSBinFormatELF()) {
+    // Use COMDAT and register globals even if n == 0 to ensure that (a) the
+    // linkage unit will only have one module constructor, and (b) the register
+    // function will be called. The module destructor is not created when n ==
+    // 0.
     *CtorComdat = true;
-  } else if (UseGlobalsGC && TargetTriple.isOSBinFormatCOFF()) {
-    InstrumentGlobalsCOFF(IRB, M, NewGlobals, Initializers);
-  } else if (UseGlobalsGC && ShouldUseMachOGlobalsSection()) {
-    InstrumentGlobalsMachO(IRB, M, NewGlobals, Initializers);
+    instrumentGlobalsELF(IRB, M, NewGlobals, Initializers,
+                         getUniqueModuleId(&M));
+  } else if (n == 0) {
+    // When UseGlobalsGC is false, COMDAT can still be used if n == 0, because
+    // all compile units will have identical module constructor/destructor.
+    *CtorComdat = TargetTriple.isOSBinFormatELF();
   } else {
-    InstrumentGlobalsWithMetadataArray(IRB, M, NewGlobals, Initializers);
+    *CtorComdat = false;
+    if (UseGlobalsGC && TargetTriple.isOSBinFormatCOFF()) {
+      InstrumentGlobalsCOFF(IRB, M, NewGlobals, Initializers);
+    } else if (UseGlobalsGC && ShouldUseMachOGlobalsSection()) {
+      InstrumentGlobalsMachO(IRB, M, NewGlobals, Initializers);
+    } else {
+      InstrumentGlobalsWithMetadataArray(IRB, M, NewGlobals, Initializers);
+    }
   }
 
   // Create calls for poisoning before initializers run and unpoisoning after.
@@ -2537,7 +2541,6 @@ bool ModuleAddressSanitizer::InstrumentGlobals(IRBuilder<> &IRB, Module &M,
     createInitializerPoisonCalls(M, ModuleName);
 
   LLVM_DEBUG(dbgs() << M);
-  return true;
 }
 
 uint64_t
@@ -2601,10 +2604,10 @@ bool ModuleAddressSanitizer::instrumentModule(Module &M) {
     assert(AsanCtorFunction || ConstructorKind == AsanCtorKind::None);
     if (AsanCtorFunction) {
       IRBuilder<> IRB(AsanCtorFunction->getEntryBlock().getTerminator());
-      InstrumentGlobals(IRB, M, &CtorComdat);
+      instrumentGlobals(IRB, M, &CtorComdat);
     } else {
       IRBuilder<> IRB(*C);
-      InstrumentGlobals(IRB, M, &CtorComdat);
+      instrumentGlobals(IRB, M, &CtorComdat);
     }
   }
 
diff --git a/llvm/test/Instrumentation/AddressSanitizer/basic.ll b/llvm/test/Instrumentation/AddressSanitizer/basic.ll
index 068d6d18cd45eba..2064db5a0918fc4 100644
--- a/llvm/test/Instrumentation/AddressSanitizer/basic.ll
+++ b/llvm/test/Instrumentation/AddressSanitizer/basic.ll
@@ -210,8 +210,10 @@ define void @test_swifterror_3() sanitize_address {
 
 ;; ctor/dtor have the nounwind attribute. See uwtable.ll, they additionally have
 ;; the uwtable attribute with the module flag "uwtable".
-; CHECK: define internal void @asan.module_ctor() #[[#ATTR:]] {{(comdat )?}}{
+; CHECK: define internal void @asan.module_ctor() #[[#ATTR:]] comdat {
 ; CHECK: call void @__asan_init()
+;; __asan_register_elf_globals is called even if this module does not contain instrumented global variables.
+; CHECK: call void @__asan_register_elf_globals(i64 ptrtoint (ptr @___asan_globals_registered to i64), i64 ptrtoint (ptr @__start_asan_globals to i64), i64 ptrtoint (ptr @__stop_asan_globals to i64))
 
 ; CHECK: attributes #[[#ATTR]] = { nounwind }
 
diff --git a/llvm/test/Instrumentation/AddressSanitizer/global_metadata_array.ll b/llvm/test/Instrumentation/AddressSanitizer/global_metadata_array.ll
index 0e4d1f168d0163f..1617bf9b67aa7c5 100644
--- a/llvm/test/Instrumentation/AddressSanitizer/global_metadata_array.ll
+++ b/llvm/test/Instrumentation/AddressSanitizer/global_metadata_array.ll
@@ -1,8 +1,12 @@
-; RUN: opt < %s -passes=asan -asan-globals-live-support=0 -mtriple=x86_64-unknown-linux-gnu -S | FileCheck --check-prefix=CHECK %s
-; RUN: opt < %s -passes=asan -asan-globals-live-support=0 -mtriple=x86_64-apple-macosx10.11.0 -S | FileCheck --check-prefix=CHECK %s
-; RUN: opt < %s -passes=asan -asan-globals-live-support=0 -mtriple=x86_64-pc-windows-msvc19.0.24215 -S | FileCheck --check-prefix=CHECK %s
-; RUN: opt < %s -passes=asan -asan-globals-live-support=0 -asan-mapping-scale=5 -mtriple=x86_64-unknown-linux-gnu -S | FileCheck --check-prefixes=CHECK,CHECK-S5 %s
+; RUN: rm -rf %t && split-file %s %t && cd %t
+; RUN: opt < a.ll -passes=asan -asan-globals-live-support=0 -mtriple=x86_64-unknown-linux-gnu -S | FileCheck --check-prefix=CHECK %s
+; RUN: opt < a.ll -passes=asan -asan-globals-live-support=0 -mtriple=x86_64-apple-macosx10.11.0 -S | FileCheck --check-prefix=CHECK %s
+; RUN: opt < a.ll -passes=asan -asan-globals-live-support=0 -mtriple=x86_64-pc-windows-msvc19.0.24215 -S | FileCheck --check-prefix=CHECK %s
+; RUN: opt < a.ll -passes=asan -asan-globals-live-support=0 -asan-mapping-scale=5 -mtriple=x86_64-unknown-linux-gnu -S | FileCheck --check-prefixes=CHECK,CHECK-S5 %s
 
+; RUN: opt < empty.ll -passes=asan -asan-globals-live-support=0 -mtriple=x86_64-unknown-linux-gnu -S | FileCheck --check-prefix=ELF-NOGC %s
+
+;--- a.ll
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 ; Globals:
@@ -59,3 +63,10 @@ attributes #1 = { nounwind sanitize_address "less-precise-fpmad"="false" "frame-
 !7 = !{!"/tmp/asan-globals.cpp", i32 7, i32 5}
 !8 = !{!"/tmp/asan-globals.cpp", i32 12, i32 14}
 !9 = !{!"/tmp/asan-globals.cpp", i32 14, i32 25}
+
+;; In the presence of instrumented global variables, asan.module_ctor does not use comdat.
+; CHECK: define internal void @asan.module_ctor() #[[#ATTR:]] {
+
+; ELF-NOGC: define internal void @asan.module_ctor() #[[#ATTR:]] comdat {
+
+;--- empty.ll
diff --git a/llvm/test/Instrumentation/AddressSanitizer/global_with_comdat.ll b/llvm/test/Instrumentation/AddressSanitizer/global_with_comdat.ll
index 47bb1f102e2fc25..699b8287d358ad6 100644
--- a/llvm/test/Instrumentation/AddressSanitizer/global_with_comdat.ll
+++ b/llvm/test/Instrumentation/AddressSanitizer/global_with_comdat.ll
@@ -4,10 +4,14 @@
 ; We keep using comdats for garbage collection if odr indicators are
 ; enabled as indicator symbols will cause link time odr violations.
 ; This is to fix PR 47925.
-;
-; RUN: opt < %s -passes=asan -asan-globals-live-support=1 -asan-use-odr-indicator=0 -S | FileCheck %s --check-prefixes=CHECK,NOCOMDAT
+; RUN: rm -rf %t && split-file %s %t && cd %t
+; RUN: opt < a.ll -passes=asan -asan-globals-live-support=1 -asan-use-odr-indicator=0 -S | FileCheck %s --check-prefixes=CHECK,NOCOMDAT
 ; Check that enabling odr indicators enables comdat for globals.
-; RUN: opt < %s -passes=asan -asan-globals-live-support=1 -S | FileCheck %s --check-prefixes=CHECK,COMDAT
+; RUN: opt < a.ll -passes=asan -asan-globals-live-support=1 -S | FileCheck %s --check-prefixes=CHECK,COMDAT
+
+; RUN: opt < no_module_id.ll -passes=asan -S | FileCheck %s --check-prefix=NOMODULEID
+
+;--- a.ll
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
@@ -87,3 +91,43 @@ attributes #1 = { nounwind sanitize_address "less-precise-fpmad"="false" "frame-
 !7 = !{!"/tmp/asan-globals.cpp", i32 7, i32 5}
 !8 = !{!"/tmp/asan-globals.cpp", i32 12, i32 14}
 !9 = !{!"/tmp/asan-globals.cpp", i32 14, i32 25}
+
+;--- no_module_id.ll
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; NOMODULEID:      $asan.module_ctor = comdat any
+; NOMODULEID:      $asan.module_dtor = comdat any
+
+;; Don't place the instrumented globals in a comdat when the unique module ID is empty.
+; NOMODULEID:      @.str = internal constant { [4 x i8], [28 x i8] } { [4 x i8] c"str\00", [28 x i8] zeroinitializer }, align 32
+; NOMODULEID:      @_ZL3buf = internal global { [4 x i8], [28 x i8] } zeroinitializer, align 32
+; NOMODULEID:      @__asan_global_.str = private global {{.*}}, section "asan_globals", !associated !0
+; NOMODULEID:      @__asan_global__ZL3buf = private global {{.*}}, section "asan_globals", !associated !1
+; NOMODULEID:      @llvm.compiler.used = appending global [4 x ptr] [ptr @.str, ptr @_ZL3buf, ptr @__asan_global_.str, ptr @__asan_global__ZL3buf]
+
+; NOMODULEID:      define internal void @asan.module_ctor() #[[#]] comdat {
+; NOMODULEID-NEXT:   call void @__asan_init()
+; NOMODULEID-NEXT:   call void @__asan_version_mismatch_check_v8()
+; NOMODULEID-NEXT:   call void @__asan_register_elf_globals(i64 ptrtoint (ptr @___asan_globals_registered to i64), i64 ptrtoint (ptr @__start_asan_globals to i64), i64 ptrtoint (ptr @__stop_asan_globals to i64))
+; NOMODULEID-NEXT:   ret void
+; NOMODULEID-NEXT: }
+
+; NOMODULEID:      !0 = !{ptr @.str}
+; NOMODULEID:      !1 = !{ptr @_ZL3buf}
+
+ at .str = private unnamed_addr constant [4 x i8] c"str\00", align 1
+ at _ZL3buf = internal unnamed_addr global [4 x i8] zeroinitializer, align 1
+ at llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 65535, ptr @_GLOBAL__sub_I_a.cc, ptr null }]
+
+declare void @ext(ptr noundef)
+
+; Function Attrs: uwtable
+define internal void @_GLOBAL__sub_I_a.cc() #2 section ".text.startup" {
+entry:
+  %0 = load i8, ptr @_ZL3buf, align 1
+  %inc = add i8 %0, 1
+  store i8 %inc, ptr @_ZL3buf, align 1
+  tail call void @ext(ptr noundef nonnull @.str)
+  ret void
+}

>From f8b28cd8607644344bda70461d548f1c8fe3d131 Mon Sep 17 00:00:00 2001
From: dankm <dan.mcgregor at usask.ca>
Date: Wed, 11 Oct 2023 10:28:04 -0600
Subject: [PATCH 22/38] [Driver] Silence stdlib warning when linking C on
 FreeBSD (#68011)

Similar to the Gnu toolchain, ignore uses of '-stdlib=libc++' when
linking C code. CMake insists on adding it to the command line when
linking C, and several other build systems do the same.
---
 clang/lib/Driver/ToolChains/FreeBSD.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/clang/lib/Driver/ToolChains/FreeBSD.cpp b/clang/lib/Driver/ToolChains/FreeBSD.cpp
index ff4d94c56f0d0ff..4d998e884f8ad42 100644
--- a/clang/lib/Driver/ToolChains/FreeBSD.cpp
+++ b/clang/lib/Driver/ToolChains/FreeBSD.cpp
@@ -356,6 +356,9 @@ void freebsd::Linker::ConstructJob(Compilation &C, const JobAction &JA,
 
   ToolChain.addProfileRTLibs(Args, CmdArgs);
 
+  // Silence warnings when linking C code with a C++ '-stdlib' argument.
+  Args.ClaimAllArgs(options::OPT_stdlib_EQ);
+
   const char *Exec = Args.MakeArgString(getToolChain().GetLinkerPath());
   C.addCommand(std::make_unique<Command>(JA, *this,
                                          ResponseFileSupport::AtFileCurCP(),

>From 44c37c81c752b02feb799c523283902b1b4aedf2 Mon Sep 17 00:00:00 2001
From: Peiming Liu <36770114+PeimingLiu at users.noreply.github.com>
Date: Wed, 11 Oct 2023 09:34:11 -0700
Subject: [PATCH 23/38] [mlir][sparse] simplify ConvertOp rewriting rules
 (#68350)

Canonicalize a complex ConvertOp into multiple stages, such that each stage can
be done either by a direct conversion or by sorting.
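
As a rough illustration of the staging decision (a hedged C++ sketch with
made-up names, not the SparseTensor dialect API; the authoritative logic is
ConvertOp::directConvertable() in the diff below):

```cpp
// Sketch of when a sparse-tensor convert can be lowered in one direct pass and
// when it is staged through a sorted temporary. All names are hypothetical.
enum class ConvertPlan { Direct, SortThenConvert };

struct ConvertShape {
  bool dstIsDenseOrUnordered; // destination allows random access / is unordered
  bool srcAllOrdered;         // source levels are already ordered
  bool sameDimToLvl;          // source and destination share the dim-to-level map
  bool srcIsSparseConstant;   // dense input defined by a sparse constant
};

ConvertPlan planConvert(const ConvertShape &c) {
  // Mirrors the directConvertable() conditions: random-access or
  // order-compatible destinations (or sparse-constant sources) can be filled
  // by a single foreach pass.
  if (c.dstIsDenseOrUnordered || (c.srcAllOrdered && c.sameDimToLvl) ||
      c.srcIsSparseConstant)
    return ConvertPlan::Direct;
  // Everything else is staged through a temporary COO buffer that is sorted
  // into the destination order, followed by a direct convert.
  return ConvertPlan::SortThenConvert;
}
```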
---
 .../SparseTensor/IR/SparseTensorOps.td        |  11 +
 .../SparseTensor/IR/SparseTensorDialect.cpp   |  38 ++
 .../Transforms/SparseTensorCodegen.cpp        |  48 +++
 .../Transforms/SparseTensorRewriting.cpp      | 364 ++++++------------
 .../SparsificationAndBufferizationPass.cpp    |   1 +
 .../Transforms/StageSparseOperations.cpp      |  67 +++-
 .../SparseTensor/codegen_sparse_dealloc.mlir  |   3 +
 7 files changed, 277 insertions(+), 255 deletions(-)

diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
index 7ea5ca23f122a8a..042ae9693f486e6 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
@@ -195,6 +195,17 @@ def SparseTensor_ConvertOp : SparseTensor_Op<"convert",
     ```
 
   }];
+
+  let extraClassDeclaration = [{
+     // Whether the convert can be done in a single step (either a sort or a foreach),
+     // or whether it requires a tmp buffer (sort, then foreach).
+     bool directConvertable();
+
+     // Whether the convert is actually a COO sort.
+     // TODO: This method will be removed when a sort_coo operation is introduced.
+     bool isSortCOOConvert();
+  }];
+
   let assemblyFormat = "$source attr-dict `:` type($source) `to` type($dest)";
   let hasFolder = 1;
   let hasVerifier = 1;
diff --git a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
index 96ed5f13b9d9ecb..5b84d2158bc8280 100644
--- a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
+++ b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
@@ -1066,6 +1066,44 @@ OpFoldResult ConvertOp::fold(FoldAdaptor adaptor) {
   return {};
 }
 
+bool ConvertOp::directConvertable() {
+  if (isSortCOOConvert())
+    return false;
+
+  SparseTensorType srcStt = getSparseTensorType(getSource());
+  SparseTensorType dstStt = getSparseTensorType(getDest());
+
+  // We can always directly convert to an unordered sparse tensor or a dense
+  // tensor, since dense tensors support random access.
+  if (dstStt.isAllDense() || !dstStt.isAllOrdered())
+    return true;
+
+  if (srcStt.isAllOrdered() && dstStt.isAllOrdered() &&
+      srcStt.hasSameDimToLvl(dstStt)) {
+    return true;
+  }
+
+  // Source and dest tensors are ordered in different ways. We only do direct
+  // dense-to-sparse conversion when the dense input is defined by a sparse
+  // constant. Note that we can theoretically always directly convert from dense
+  // inputs by rotating dense loops, but doing so leads to bad cache locality and
+  // hurts performance.
+  if (auto constOp = getSource().getDefiningOp<arith::ConstantOp>())
+    if (isa<SparseElementsAttr>(constOp.getValue()))
+      return true;
+
+  return false;
+}
+
+bool ConvertOp::isSortCOOConvert() {
+  // TODO: we should instead use a different sort_coo operation to handle
+  // the conversion between COOs (but with different ordering).
+  return isUniqueCOOType(getSource().getType()) &&
+         isUniqueCOOType(getDest().getType()) &&
+         !getSparseTensorType(getSource()).isAllOrdered() &&
+         getSparseTensorType(getDest()).isAllOrdered();
+}
+
 LogicalResult ToPositionsOp::verify() {
   auto e = getSparseTensorEncoding(getTensor().getType());
   if (failed(lvlIsInBounds(getLevel(), getTensor())))
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp
index e22789643c90af7..fdecfe303d31351 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp
@@ -679,6 +679,50 @@ class SparseDimOpConverter : public OpConversionPattern<tensor::DimOp> {
   }
 };
 
+// TODO: use a new SortCOO operation here instead of reusing convert op.
+struct SparseSortCOOConverter : public OpConversionPattern<ConvertOp> {
+  using OpConversionPattern::OpConversionPattern;
+  LogicalResult
+  matchAndRewrite(ConvertOp op, ConvertOpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    // Direct conversion should have already been lowered.
+    if (!op.isSortCOOConvert())
+      return failure();
+
+    Location loc = op.getLoc();
+    MLIRContext *ctx = op.getContext();
+
+    SparseTensorType srcStt = getSparseTensorType(op.getSource());
+    SparseTensorType dstStt = getSparseTensorType(op.getDest());
+
+    // TODO: This should be verification rules for sort_coo operation.
+    assert(dstStt.isAllOrdered() && !srcStt.isAllOrdered() &&
+           isUniqueCOOType(srcStt.getRankedTensorType()) &&
+           isUniqueCOOType(dstStt.getRankedTensorType()));
+
+    assert(dstStt.hasSameDimToLvl(srcStt));
+
+    // We don't need a mutable descriptor here as we perform sorting in-place.
+    auto nnz = genValMemSize(rewriter, op.getLoc(), adaptor.getSource());
+    auto desc = getDescriptorFromTensorTuple(adaptor.getSource());
+    auto crd = desc.getAOSMemRef();
+    auto val = desc.getValMemRef();
+
+    // Otherwise we need another data shuffle and a non-identity map.
+    assert(dstStt.hasSameDimToLvl(srcStt));
+    auto id = AffineMap::getMultiDimIdentityMap(srcStt.getLvlRank(), ctx);
+
+    rewriter.create<SortOp>(loc, nnz, crd, ValueRange{val}, id,
+                            rewriter.getIndexAttr(0),
+                            SparseTensorSortKind::HybridQuickSort);
+
+    // Since we do in-place sorting, the destination tensor will have the same
+    // set of memrefs as the source tensor.
+    rewriter.replaceOp(op, adaptor.getSource());
+    return success();
+  }
+};
+
 template <typename Op, StorageSpecifierKind kind>
 class SparseSliceGetterOpConverter : public OpConversionPattern<Op> {
 public:
@@ -1101,6 +1145,9 @@ class SparseConvertConverter : public OpConversionPattern<ConvertOp> {
   LogicalResult
   matchAndRewrite(ConvertOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
+    if (op.isSortCOOConvert())
+      return failure();
+
     SparseTensorEncodingAttr encDst = getSparseTensorEncoding(op.getType());
     SparseTensorEncodingAttr encSrc =
         getSparseTensorEncoding(op.getSource().getType());
@@ -1554,6 +1601,7 @@ void mlir::populateSparseTensorCodegenPatterns(
                SparseCastConverter, SparseExtractSliceConverter,
                SparseTensorLoadConverter, SparseExpandConverter,
                SparseCompressConverter, SparseInsertConverter,
+               SparseSortCOOConverter,
                SparseSliceGetterOpConverter<ToSliceOffsetOp,
                                             StorageSpecifierKind::DimOffset>,
                SparseSliceGetterOpConverter<ToSliceStrideOp,
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp
index b0bd22b156cc292..592852f87ba1e04 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp
@@ -147,8 +147,7 @@ static RankedTensorType getBufferType(const SparseTensorType &stt,
 /// Collects the dynamic dimension sizes for `tp` with the assumption that
 /// `sizes` are the dimension sizes for the type. Stores the dynamic dimension
 /// sizes to dynSizes.
-static void getDynamicSizes(RankedTensorType tp,
-                            const SmallVectorImpl<Value> &sizes,
+static void getDynamicSizes(RankedTensorType tp, ValueRange sizes,
                             SmallVectorImpl<Value> &dynSizes) {
   for (const auto &d : enumerate(tp.getShape())) {
     if (d.value() == ShapedType::kDynamic)
@@ -884,8 +883,7 @@ struct ConcatenateRewriter : public OpRewritePattern<ConcatenateOp> {
       }
 
       needTmpCOO = !allDense && !allOrdered;
-      const RankedTensorType tp =
-          getBufferType(dstTp.withoutDimToLvl(), needTmpCOO);
+      const RankedTensorType tp = getBufferType(dstTp, needTmpCOO);
       encDst = needTmpCOO ? getSparseTensorEncoding(tp) : encDst;
       SmallVector<Value> dynSizes;
       getDynamicSizes(dstTp, sizes, dynSizes);
@@ -971,7 +969,10 @@ struct ConcatenateRewriter : public OpRewritePattern<ConcatenateOp> {
       dst = rewriter.create<LoadOp>(loc, dst, true);
       if (needTmpCOO) {
         Value tmpCoo = dst;
-        dst = rewriter.create<ConvertOp>(loc, dstRTT, tmpCoo).getResult();
+        Type dstCooTp = getCOOType(dstRTT, true);
+        // TODO: this should be a sort_coo operation.
+        dst = rewriter.create<ConvertOp>(loc, dstCooTp, tmpCoo).getResult();
+        dst = rewriter.create<ConvertOp>(loc, dstRTT, dst).getResult();
         rewriter.create<DeallocTensorOp>(loc, tmpCoo);
       }
       rewriter.replaceOp(op, dst);
@@ -980,11 +981,60 @@ struct ConcatenateRewriter : public OpRewritePattern<ConcatenateOp> {
   }
 };
 
-/// Sparse rewriting rule for the convert operator.
-struct ConvertRewriter : public OpRewritePattern<ConvertOp> {
+struct TensorLike {
+  TensorLike(OpBuilder &builder, Location loc, RankedTensorType rtt,
+             ValueRange sizes)
+      : isSparse(rtt.getEncoding() != nullptr) {
+    SmallVector<Value> dynSzs;
+    getDynamicSizes(rtt, sizes, dynSzs);
+
+    if (isSparse)
+      val = builder.create<AllocTensorOp>(loc, rtt, dynSzs);
+    else
+      val = allocDenseTensor(builder, loc, rtt, sizes);
+  };
+
+  void insertOrStore(OpBuilder &builder, Location loc, Value v,
+                     ValueRange crds) {
+    if (isSparse)
+      val = builder.create<InsertOp>(loc, v, val, crds);
+    else
+      builder.create<memref::StoreOp>(loc, v, val, crds);
+  }
+
+  Value getSSA() const {
+    // We don't need to maintain the SSA chain for a memref value.
+    return isSparse ? val : nullptr;
+  }
+
+  Value finalize(OpBuilder &builder, Location loc, RankedTensorType rtp) const {
+    if (isSparse)
+      return builder.create<LoadOp>(loc, val, true);
+    return builder.create<bufferization::ToTensorOp>(loc, rtp, val);
+  }
+
+  void updateSSA(Value v) {
+    // Dense memref is a non-SSA value.
+    assert(isSparse);
+    val = v;
+  }
+
+private:
+  bool isSparse;
+  Value val; // either a memref (for dense tensor) or a sparse tensor.
+};
+
+struct DirectConvertRewriter : public OpRewritePattern<ConvertOp> {
   using OpRewritePattern::OpRewritePattern;
   LogicalResult matchAndRewrite(ConvertOp op,
                                 PatternRewriter &rewriter) const override {
+    if (!op.directConvertable() && !op.isSortCOOConvert())
+      return op.emitError("ConvertOp not in canonical form.");
+
+    if (op.isSortCOOConvert())
+      return failure();
+
+    // TODO: Maybe we want a different operation for this too.
     auto encDst = getSparseTensorEncoding(op.getType());
     auto encSrc = getSparseTensorEncoding(op.getSource().getType());
     if (encDst && encSrc && !encSrc.isSlice() &&
@@ -993,272 +1043,79 @@ struct ConvertRewriter : public OpRewritePattern<ConvertOp> {
       // in codegen.
       return failure();
     }
-    // TODO: Add a cast before generating InsertOp.
-    assert(op.getSource().getType().getElementType() ==
-           op.getDest().getType().getElementType());
-    if (encSrc && encDst)
-      return sparse2SparseRewrite(op, rewriter);
-    if (encSrc && !encDst)
-      return sparse2DenseRewrite(op, rewriter);
-    if (!encSrc && encDst)
-      return dense2SparseRewrite(op, rewriter);
-
-    // Dense-to-dense convert is a nop and handled by canonicalization.
-    return failure();
-  }
 
-private:
-  // Handles sparse constant to sparse tensor or dense tensor to sparse tensor
-  // conversion as follows:
-  //   t = new sparse COO tensor
-  //   fill t using src
-  //   dst = convert t
-  //
-  // To fill the COO tensor from a dense tensor:
-  //   for i1 in dim1
-  //    ..
-  //     for ik in dimk
-  //       val = a[i1,..,ik]
-  //       if val != 0
-  //         t->add(val, [i1,..,ik], [p1,..,pk])
-  //
-  // To fill the COO tensor from a sparse constant in COO format:
-  //   for i in range(NNZ)
-  //     val = values[i]
-  //     [i1,..,ik] = coordinates[i]
-  //     t->add(val, [i1,..,ik], [p1,..,pk])
-  LogicalResult dense2SparseRewrite(ConvertOp op,
-                                    PatternRewriter &rewriter) const {
     Location loc = op.getLoc();
     Value src = op.getSource();
-    const auto dstTp = getSparseTensorType(op);
-    SmallVector<Value> sizes;
-    sizesFromSrc(rewriter, sizes, loc, src);
-    SmallVector<Value> dynSizes;
-    getDynamicSizes(dstTp, sizes, dynSizes);
+
+    SparseTensorType srcStt = getSparseTensorType(op.getSource());
+    SparseTensorType dstStt = getSparseTensorType(op.getDest());
 
     bool fromSparseConst = false;
-    if (auto constOp = op.getSource().getDefiningOp<arith::ConstantOp>()) {
-      if (dyn_cast<SparseElementsAttr>(constOp.getValue())) {
+    if (auto constOp = op.getSource().getDefiningOp<arith::ConstantOp>())
+      if (dyn_cast<SparseElementsAttr>(constOp.getValue()))
         fromSparseConst = true;
-      }
-    }
 
-    const auto encDst = dstTp.getEncoding();
-    // We don't need a temporary COO tensor if the destination has an identity
-    // ordering. Otherwise, we use the destination ordering for the temporary
-    // COO tensor.
-    // TODO: enhance foreachOp to take ordering to remove the need of a
-    // temporary COO tensor here.
-    const RankedTensorType bufferTp =
-        getBufferType(dstTp, !dstTp.isIdentity() && !fromSparseConst);
-    // Only imposes foreach order on dense constant (which will be statically
-    // sorted by the sparse compiler), otherwise the rotated loop sequence
-    // results to bad cache locality.
     const AffineMapAttr foreachOrder =
-        (!dstTp.isIdentity() && fromSparseConst)
-            ? AffineMapAttr::get(dstTp.getExpandedDimToLvl())
+        (!dstStt.isIdentity() && fromSparseConst)
+            ? AffineMapAttr::get(dstStt.getExpandedDimToLvl())
             : nullptr;
-    // TODO: This assertion is to match the behavior from before we merged
-    // dimOrdering and higherOrdering into dimToLvl.  Although the above
-    // can construct `foreachOrder` for non-permutations, it's not clear
-    // that the `foreachOp` below actually supports non-permutations.
-    assert(!foreachOrder || dstTp.isPermutation());
-
-    auto buffer =
-        rewriter.create<AllocTensorOp>(loc, bufferTp, dynSizes).getResult();
+
+    bool skipZeroCheck = srcStt.hasEncoding() || fromSparseConst;
+
+    SmallVector<Value> sizes;
+    sizesFromSrc(rewriter, sizes, loc, src);
+    ValueRange vs;
+    TensorLike dstBuf(rewriter, loc, dstStt.getRankedTensorType(), sizes);
+
+    Value iterArg = dstBuf.getSSA();
     auto foreachOp = rewriter.create<ForeachOp>(
-        loc, src, buffer, foreachOrder,
+        loc, src, iterArg ? ValueRange{iterArg} : ValueRange{}, foreachOrder,
         [&](OpBuilder &builder, Location loc, ValueRange dcvs, Value v,
             ValueRange reduc) {
-          Value input = reduc.front();
-          const Dimension dimRank = dstTp.getDimRank();
-          const Level lvlRank = dstTp.getLvlRank();
+          // Entering the loop: update the SSA value for the insertion chain.
+          if (!reduc.empty())
+            dstBuf.updateSSA(reduc.front());
+
+          const Dimension dimRank = dstStt.getDimRank();
+          const Level lvlRank = dstStt.getLvlRank();
           SmallVector<Value> lcvs(lvlRank);
-          for (Dimension d = 0; d < dimRank; d++)
+          for (Dimension d = 0; d < dimRank; d++) {
             // FIXME: `toStoredDim` is deprecated
-            lcvs[toStoredDim(encDst, d)] = dcvs[d];
-          if (fromSparseConst) {
-            input = builder.create<InsertOp>(loc, v, input, lcvs);
-          } else {
+            lcvs[toStoredDim(dstStt.getEncoding(), d)] = dcvs[d];
+          }
+
+          if (!skipZeroCheck) {
+            assert(!reduc.empty());
             Value cond = genIsNonzero(builder, loc, v);
-            auto ifOp = builder.create<scf::IfOp>(
-                loc, TypeRange(input.getType()), cond, /*else*/ true);
-            builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
-            Value insert = builder.create<InsertOp>(loc, v, input, lcvs);
-            builder.create<scf::YieldOp>(loc, insert);
+            auto ifOp = builder.create<scf::IfOp>(loc, reduc.getTypes(), cond,
+                                                  /*else*/ true);
             builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
-            builder.create<scf::YieldOp>(loc, input);
+            builder.create<scf::YieldOp>(loc, dstBuf.getSSA());
+
+            builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
+            dstBuf.insertOrStore(builder, loc, v, lcvs);
+            builder.create<scf::YieldOp>(loc, dstBuf.getSSA());
+
+            // Exiting the ifOp: update the sparse tensor SSA value.
             builder.setInsertionPointAfter(ifOp);
-            input = ifOp.getResult(0);
+            dstBuf.updateSSA(ifOp.getResult(0));
+          } else {
+            dstBuf.insertOrStore(builder, loc, v, lcvs);
           }
-          builder.create<sparse_tensor::YieldOp>(loc, input);
+          if (reduc.empty())
+            builder.create<sparse_tensor::YieldOp>(loc);
+          else
+            builder.create<sparse_tensor::YieldOp>(loc, dstBuf.getSSA());
         });
-    rewriter.setInsertionPointAfter(op);
-    src = rewriter.create<LoadOp>(loc, foreachOp.getResult(0), true);
-    if (bufferTp != dstTp) {
-      rewriter.replaceOpWithNewOp<ConvertOp>(op, dstTp.getRankedTensorType(),
-                                             src);
-      rewriter.create<DeallocTensorOp>(loc, src);
-    } else {
-      rewriter.replaceOp(op, src);
-    }
-
-    return success();
-  }
-
-  // Handles sparse tensor to dense tensor conversion as follows:
-  //   dst = new dense tensor;
-  //   foreach elemment in src
-  //     dst[element.coords] = element.value
-  LogicalResult sparse2DenseRewrite(ConvertOp op,
-                                    PatternRewriter &rewriter) const {
-    Location loc = op->getLoc();
-    RankedTensorType dstTp = getRankedTensorType(op);
-    Value src = op.getSource();
-    RankedTensorType srcTp = getRankedTensorType(src);
-
-    SmallVector<Value> sizes;
-    sizesForTensor(rewriter, sizes, loc, srcTp, src);
-
-    Value dst = allocDenseTensor(rewriter, loc, dstTp, sizes);
-
-    rewriter.create<ForeachOp>(loc, src, std::nullopt,
-                               [&](OpBuilder &builder, Location loc,
-                                   ValueRange args, Value v, ValueRange reduc) {
-                                 builder.create<memref::StoreOp>(loc, v, dst,
-                                                                 args);
-                                 builder.create<sparse_tensor::YieldOp>(loc);
-                               });
 
-    rewriter.replaceOpWithNewOp<bufferization::ToTensorOp>(op, dstTp, dst);
-    return success();
-  }
+    rewriter.setInsertionPointAfter(foreachOp);
 
-  // Handles sparse tensor to sparse tensor conversion as follows:
-  //   if src is not COO
-  //       construct a COO to represent the src
-  //   sort the src COO
-  //   foreach elemment in the sorted src COO
-  //     insert element to dst
-  LogicalResult sparse2SparseRewrite(ConvertOp op,
-                                     PatternRewriter &rewriter) const {
-    const Location loc = op->getLoc();
-    // These two variables cannot be `const` because they're conditionally
-    // changed below.  Ideally we'd use `SparseTensorType` for `srcRTT`;
-    // however that class's copy-ctor is implicitly deleted.
-    Value src = op.getSource();
-    auto srcRTT = getRankedTensorType(src);
-    const auto dstTp = getSparseTensorType(op);
-    const auto encDst = dstTp.getEncoding();
-    const Level dstLvlRank = dstTp.getLvlRank();
-    const Dimension dimRank = dstTp.getDimRank();
-    // This assertion should be guaranteed by validity of the op,
-    // but just for paranoia's sake.
-    assert(static_cast<Dimension>(srcRTT.getRank()) == dimRank);
-
-    SmallVector<Value> srcSizes;
-    sizesForTensor(rewriter, srcSizes, loc, srcRTT, src);
-    Value tmpCoo = Value();
-    Value nnz = rewriter.create<NumberOfEntriesOp>(loc, src);
-    // We need a tmp COO buffer if and only if
-    // 1. the src tensor is not a COO and
-    // 2. the src tensor is not ordered in the same way as the target
-    // tensor (e.g., src tensor is not ordered or src tensor haves a different
-    // dimToLvl).
-    if (const SparseTensorType srcTp(srcRTT);
-        !(srcTp.isAllOrdered() && srcTp.hasSameDimToLvl(dstTp))) {
-      // Construct a COO tensor from the src tensor.
-      // TODO: there may be cases for which more efficiently without
-      // going through an intermediate COO, such as cases that only change
-      // the overhead types.
-      SmallVector<Value> dynSrcSizes;
-      getDynamicSizes(srcRTT, srcSizes, dynSrcSizes);
-      srcRTT = getCOOType(srcTp.withDimToLvl(dstTp), /*ordered=*/false);
-      // Ensure that mutating `srcRTT` didn't invalidate `dimRank`.
-      assert(static_cast<Dimension>(srcRTT.getRank()) == dimRank);
-      tmpCoo = rewriter
-                   .create<AllocTensorOp>(loc, srcRTT, dynSrcSizes, Value(),
-                                          /*sizeHint=*/nnz, Attribute())
-                   .getResult();
-      auto foreachOp = rewriter.create<ForeachOp>(
-          loc, src, tmpCoo,
-          [&](OpBuilder &builder, Location loc, ValueRange dcvs, Value v,
-              ValueRange reduc) {
-            SmallVector<Value> dstLcvs(dstLvlRank);
-            for (Dimension d = 0; d < dimRank; d++) {
-              // FIXME: `toStoredDim` is deprecated
-              Level l = toStoredDim(encDst, d);
-              dstLcvs[l] = dcvs[d];
-            }
-            auto t = builder.create<InsertOp>(loc, v, reduc.front(), dstLcvs);
-            builder.create<sparse_tensor::YieldOp>(loc, t);
-          });
-      src = rewriter.create<LoadOp>(loc, foreachOp.getResult(0), true);
-    }
-
-    // Now that the conditional is done, we can use `SparseTensorType`.
-    const SparseTensorType srcTp(srcRTT);
-
-    // Only need to sort if the srcTp is not already sorted (we faithfully take
-    // the guarantee from the sparse tensor encoding).
-    if (!srcTp.isAllOrdered()) {
-      // Retrieve the values-array.
-      Value y = genToValues(rewriter, loc, src);
-      const auto encSrc = srcTp.getEncoding();
-      // Builds the dstLvl -> srcLvl permutation maps.
-      SmallVector<AffineExpr> es(dstLvlRank);
-      const Level srcLvlRank = srcTp.getLvlRank();
-      for (Level srcLvl = 0; srcLvl < srcLvlRank; srcLvl++) {
-        // FIXME: `toOrigDim` is deprecated
-        Dimension dim = toOrigDim(encSrc, srcLvl);
-        // FIXME: `toStoredDim` is deprecated
-        Level dstLvl = toStoredDim(encDst, dim);
-        es[dstLvl] = rewriter.getAffineDimExpr(srcLvl);
-      }
-      auto xPerm = AffineMap::get(dstLvlRank, 0, es, rewriter.getContext());
-      assert(xPerm.isPermutation()); // must be a permutation.
-
-      Value xs = genToCoordinatesBuffer(rewriter, loc, src);
-      rewriter.create<SortOp>(loc, nnz, xs, ValueRange{y}, xPerm,
-                              rewriter.getIndexAttr(0),
-                              SparseTensorSortKind::HybridQuickSort);
-    }
-
-    // For each element in the COO tensor, insert the element to the dst tensor.
-    SmallVector<Value> dynDstSizes;
-    getDynamicSizes(dstTp, srcSizes, dynDstSizes);
-    Value dst = rewriter
-                    .create<AllocTensorOp>(loc, dstTp.getRankedTensorType(),
-                                           dynDstSizes, Value(),
-                                           /*sizeHint=*/nnz, Attribute())
-                    .getResult();
-    SmallVector<Value> dstLcvs(dstLvlRank);
-    auto foreachOp = rewriter.create<ForeachOp>(
-        loc, src, dst,
-        [&](OpBuilder &builder, Location loc, ValueRange dcvs, Value v,
-            ValueRange reduc) {
-          for (Dimension d = 0; d < dimRank; d++) {
-            // FIXME: `toStoredDim` is deprecated
-            Level l = toStoredDim(encDst, d);
-            dstLcvs[l] = dcvs[d];
-          }
-          auto t = builder.create<InsertOp>(loc, v, reduc.front(), dstLcvs);
-          builder.create<sparse_tensor::YieldOp>(loc, t);
-        });
+    // On exiting the foreach loop, link up the SSA chain.
+    if (!foreachOp.getResults().empty())
+      dstBuf.updateSSA(foreachOp.getResult(0));
 
-    // Release the temporary COO if it is created. Note that tmpCoo is
-    // invalidated due to foreach and updated to src.
-    if (tmpCoo)
-      rewriter.create<DeallocTensorOp>(loc, src);
-
-    // Directly replace op with dst results in bufferization error message
-    // "sparse tensor allocation should not escape function".
-    // As such, we insert a trivial tensor convert which will be removed by
-    // codegen.
-    rewriter.setInsertionPointAfter(op);
-    auto t = rewriter.create<LoadOp>(loc, foreachOp.getResult(0), true);
-    rewriter.replaceOpWithNewOp<ConvertOp>(op, dstTp.getRankedTensorType(), t);
+    Value ret = dstBuf.finalize(rewriter, loc, dstStt.getRankedTensorType());
+    rewriter.replaceOp(op, ret);
     return success();
   }
 };
@@ -1482,10 +1339,11 @@ void mlir::populatePostSparsificationRewriting(RewritePatternSet &patterns,
   if (enableForeach)
     patterns.add<ForeachRewriter>(patterns.getContext());
 
-  // TODO: If RT not enabled, rewrite concatenate ops, etc here.
   if (!enableRT) {
     patterns.add<NewRewriter, OutRewriter>(patterns.getContext());
+    // TODO: Move this to a common path for both lib/codegen when libgen
+    // supports lowering sort_coo.
     if (enableConvert)
-      patterns.add<ConvertRewriter>(patterns.getContext());
+      patterns.add<DirectConvertRewriter>(patterns.getContext());
   }
 }
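
For readers skimming the rewriter above: the core idea is that an element coming from another sparse tensor can be inserted into the destination unconditionally, while an element read from a dense or constant source must be guarded so explicit zeros are never materialized. The following is a minimal standalone C++ sketch of that guarded insertion, illustrative only; the names (SparseDst, insertOrStore, copyElement) are hypothetical and not part of the MLIR sources.

#include <cstdint>
#include <map>
#include <vector>

using Coords = std::vector<int64_t>;

struct SparseDst {
  // Stand-in for the destination's insertion chain / SSA value.
  std::map<Coords, double> elements;
  void insertOrStore(const Coords &lcvs, double v) { elements[lcvs] = v; }
};

// Mirrors the body of the ForeachOp above: insert one element, guarding
// against explicit zeros unless the source is known to be all-nonzero.
void copyElement(SparseDst &dst, const Coords &lcvs, double v,
                 bool skipZeroCheck) {
  if (skipZeroCheck) {
    dst.insertOrStore(lcvs, v); // e.g. the source is another sparse tensor
    return;
  }
  if (v != 0.0) // corresponds to the scf.if wrapped around the insertion
    dst.insertOrStore(lcvs, v);
}
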
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparsificationAndBufferizationPass.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparsificationAndBufferizationPass.cpp
index 480e18e257277de..552a29f66769399 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparsificationAndBufferizationPass.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparsificationAndBufferizationPass.cpp
@@ -142,6 +142,7 @@ class SparsificationAndBufferizationPass
     {
       OpPassManager pm("builtin.module");
       pm.addPass(createSparsificationPass(sparsificationOptions));
+      pm.addNestedPass<func::FuncOp>(createStageSparseOperationsPass());
       pm.addPass(createPostSparsificationRewritePass(enableRuntimeLibrary));
       if (vectorLength > 0) {
         pm.addPass(mlir::createLoopInvariantCodeMotionPass());
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/StageSparseOperations.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/StageSparseOperations.cpp
index 4adc4d131198cc7..60ac71de4dd71ca 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/StageSparseOperations.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/StageSparseOperations.cpp
@@ -1,4 +1,67 @@
+//===- StageSparseOperations.cpp - stage sparse ops rewriting rules -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/SparseTensor/IR/SparseTensor.h"
+#include "mlir/Dialect/SparseTensor/IR/SparseTensorType.h"
 #include "mlir/Dialect/SparseTensor/Transforms/Passes.h"
 
-void mlir::populateStageSparseOperationsPatterns(
-    RewritePatternSet & /*patterns*/) {}
+using namespace mlir;
+using namespace mlir::sparse_tensor;
+
+namespace {
+
+struct StageUnorderedConvert : public OpRewritePattern<ConvertOp> {
+  using OpRewritePattern<ConvertOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(ConvertOp op,
+                                PatternRewriter &rewriter) const override {
+    // TODO: Implement this as an interface so it can be reused by other
+    // operations too (e.g., concatenate, reshape, etc.).
+
+    if (op.directConvertable() || op.isSortCOOConvert())
+      return failure();
+
+    Location loc = op.getLoc();
+    SparseTensorType srcStt = getSparseTensorType(op.getSource());
+    SparseTensorType dstStt = getSparseTensorType(op.getDest());
+
+    // A conversion to a dense tensor must always be direct (never staged).
+    assert(!dstStt.isAllDense());
+
+    // source -> coo
+    // The tmp COO must be unordered, otherwise it is a direct conversion.
+    assert(!(srcStt.hasSameDimToLvl(dstStt) && srcStt.isAllOrdered()));
+    Type srcCOOTp = getCOOFromTypeWithOrdering(
+        dstStt.getRankedTensorType(), dstStt.getDimToLvl(), /*ordered=*/false);
+    Value srcCOO = rewriter.create<ConvertOp>(loc, srcCOOTp, op.getSource());
+
+    // -> sort
+    Type dstCOOTp = getCOOFromTypeWithOrdering(
+        dstStt.getRankedTensorType(), dstStt.getDimToLvl(), /*ordered=*/true);
+    // TODO: this should be a sort_coo operation.
+    Value dstCOO = rewriter.create<ConvertOp>(loc, dstCOOTp, srcCOO);
+
+    // -> dest.
+    if (dstCOO.getType() == op.getType()) {
+      rewriter.replaceOp(op, dstCOO);
+    } else {
+      // Need an extra conversion if the target type is not COO.
+      rewriter.replaceOpWithNewOp<ConvertOp>(op, op.getDest().getType(),
+                                             dstCOO);
+    }
+    // TODO: deallocate the extra COOs; this should probably be delegated to
+    // the buffer deallocation pass.
+
+    return success();
+  }
+};
+} // namespace
+
+void mlir::populateStageSparseOperationsPatterns(RewritePatternSet &patterns) {
+  patterns.add<StageUnorderedConvert>(patterns.getContext());
+}
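
The StageUnorderedConvert pattern above splits one convert into three steps: gather the source into an unordered COO, sort that COO into the destination's level order, and convert the ordered COO into the destination format. Below is a small standalone C++ sketch of the sort step only; it assumes coordinates have already been mapped to destination levels, and the names (CooElement, sortToDstOrder) are illustrative rather than part of the MLIR API.

#include <algorithm>
#include <cstdint>
#include <vector>

struct CooElement {
  std::vector<int64_t> lvlCoords; // coordinates already mapped to dst levels
  double value;
};

// Sort the unordered COO lexicographically in destination level order,
// which is the guarantee the ordered-COO conversion (future sort_coo)
// provides before the final insertion pass.
void sortToDstOrder(std::vector<CooElement> &coo) {
  std::sort(coo.begin(), coo.end(),
            [](const CooElement &a, const CooElement &b) {
              return a.lvlCoords < b.lvlCoords; // lexicographic compare
            });
}
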
diff --git a/mlir/test/Dialect/SparseTensor/codegen_sparse_dealloc.mlir b/mlir/test/Dialect/SparseTensor/codegen_sparse_dealloc.mlir
index 59e568dd5de6461..49994a33c1911c6 100644
--- a/mlir/test/Dialect/SparseTensor/codegen_sparse_dealloc.mlir
+++ b/mlir/test/Dialect/SparseTensor/codegen_sparse_dealloc.mlir
@@ -1,3 +1,6 @@
+// UNSUPPORTED: target={{.*}}
+// TODO: the test is temporarily disabled (we probably do not need the option anymore after switching to the buffer deallocation pass)
+//
 // RUN: mlir-opt %s --post-sparsification-rewrite="enable-runtime-library=false" \
 // RUN:    --sparse-tensor-codegen=create-sparse-deallocs=false \
 // RUN:    --canonicalize --cse | FileCheck %s -check-prefix=CHECK-NO-DEALLOC

>From c3cdf7aaeb0fada359cb43e2390bf7cbe88dfcbf Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles at arm.com>
Date: Wed, 11 Oct 2023 16:37:40 +0000
Subject: [PATCH 24/38] [flang] Add missing dependency FIRCodeGen ->
 FIRAnalysis

After https://github.com/llvm/llvm-project/pull/68437
---
 flang/lib/Optimizer/CodeGen/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/flang/lib/Optimizer/CodeGen/CMakeLists.txt b/flang/lib/Optimizer/CodeGen/CMakeLists.txt
index 0b2e3943c1068dd..0daa97b00dfa00c 100644
--- a/flang/lib/Optimizer/CodeGen/CMakeLists.txt
+++ b/flang/lib/Optimizer/CodeGen/CMakeLists.txt
@@ -14,6 +14,7 @@ add_flang_library(FIRCodeGen
   CGOpsIncGen
 
   LINK_LIBS
+  FIRAnalysis
   FIRBuilder
   FIRDialect
   FIRDialectSupport

>From 0e425fd7b080476c9c460859e7f18a7fcc96b9a0 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Wed, 11 Oct 2023 17:57:42 +0100
Subject: [PATCH 25/38] [AArch64] Fix postinc operands for Neoverse-N2
 scheduling

Similar to D159254, this fixes the order of WriteAdr operands on post/pre-inc
loads/stores in the Neoverse-N2 scheduling model.
---
 .../Target/AArch64/AArch64SchedNeoverseN2.td  |  102 +-
 .../llvm-mca/AArch64/Neoverse/N2-writeback.s  | 3839 ++++++++---------
 2 files changed, 1965 insertions(+), 1976 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td
index a6e28e653e33c17..517d0da7f47f428 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td
@@ -742,7 +742,7 @@ def : SchedAlias<WriteLDIdx, N2Write_4cyc_1I_1L>;
 // Load pair, signed immed offset, signed words
 def : InstRW<[N2Write_5cyc_1M0, WriteLDHi], (instrs LDPSWi)>;
 // Load pair, immed post-index or immed pre-index, signed words
-def : InstRW<[N2Write_5cyc_1M0, WriteLDHi, WriteAdr],
+def : InstRW<[WriteAdr, N2Write_5cyc_1M0, WriteLDHi],
              (instregex "^LDPSW(post|pre)$")>;
 
 // Store instructions
@@ -860,7 +860,7 @@ def : InstRW<[N2Write_6cyc_1L], (instregex "^LDR[SDQ]l$",
 // Load vector reg, immed post-index
 def : InstRW<[N2Write_6cyc_1I_1L, WriteI], (instregex "^LDR[BHSDQ]post$")>;
 // Load vector reg, immed pre-index
-def : InstRW<[N2Write_6cyc_1I_1L, WriteAdr], (instregex "^LDR[BHSDQ]pre$")>;
+def : InstRW<[WriteAdr, N2Write_6cyc_1I_1L], (instregex "^LDR[BHSDQ]pre$")>;
 
 // Load vector reg, unsigned immed
 def : InstRW<[N2Write_6cyc_1L], (instregex "^LDR[BHSDQ]ui$")>;
@@ -883,12 +883,12 @@ def : InstRW<[N2Write_6cyc_2L, WriteLDHi], (instrs LDPQi, LDNPQi)>;
 
 // Load vector pair, immed post-index, S/D-form
 // Load vector pair, immed pre-index, S/D-form
-def : InstRW<[N2Write_6cyc_1I_1L, WriteLDHi, WriteAdr],
+def : InstRW<[WriteAdr, N2Write_6cyc_1I_1L, WriteLDHi],
              (instregex "^LDP[SD](pre|post)$")>;
 
 // Load vector pair, immed post-index, Q-form
 // Load vector pair, immed pre-index, Q-form
-def : InstRW<[N2Write_6cyc_2I_2L, WriteLDHi, WriteAdr], (instrs LDPQpost,
+def : InstRW<[WriteAdr, N2Write_6cyc_2I_2L, WriteLDHi], (instrs LDPQpost,
                                                                 LDPQpre)>;
 
 // FP store instructions
@@ -1238,223 +1238,223 @@ def : InstRW<[N2Write_5cyc_1M0_1V], (instregex "^INSvi(8|16|32|64)gpr$")>;
 
 // ASIMD load, 1 element, multiple, 1 reg, D-form
 def : InstRW<[N2Write_6cyc_1L], (instregex "^LD1Onev(8b|4h|2s|1d)$")>;
-def : InstRW<[N2Write_6cyc_1L, WriteAdr],
+def : InstRW<[WriteAdr, N2Write_6cyc_1L],
              (instregex "^LD1Onev(8b|4h|2s|1d)_POST$")>;
 
 // ASIMD load, 1 element, multiple, 1 reg, Q-form
 def : InstRW<[N2Write_6cyc_1L], (instregex "^LD1Onev(16b|8h|4s|2d)$")>;
-def : InstRW<[N2Write_6cyc_1L, WriteAdr],
+def : InstRW<[WriteAdr, N2Write_6cyc_1L],
              (instregex "^LD1Onev(16b|8h|4s|2d)_POST$")>;
 
 // ASIMD load, 1 element, multiple, 2 reg, D-form
 def : InstRW<[N2Write_6cyc_2L], (instregex "^LD1Twov(8b|4h|2s|1d)$")>;
-def : InstRW<[N2Write_6cyc_2L, WriteAdr],
+def : InstRW<[WriteAdr, N2Write_6cyc_2L],
              (instregex "^LD1Twov(8b|4h|2s|1d)_POST$")>;
 
 // ASIMD load, 1 element, multiple, 2 reg, Q-form
 def : InstRW<[N2Write_6cyc_2L], (instregex "^LD1Twov(16b|8h|4s|2d)$")>;
-def : InstRW<[N2Write_6cyc_2L, WriteAdr],
+def : InstRW<[WriteAdr, N2Write_6cyc_2L],
              (instregex "^LD1Twov(16b|8h|4s|2d)_POST$")>;
 
 // ASIMD load, 1 element, multiple, 3 reg, D-form
 def : InstRW<[N2Write_6cyc_3L], (instregex "^LD1Threev(8b|4h|2s|1d)$")>;
-def : InstRW<[N2Write_6cyc_3L, WriteAdr],
+def : InstRW<[WriteAdr, N2Write_6cyc_3L],
              (instregex "^LD1Threev(8b|4h|2s|1d)_POST$")>;
 
 // ASIMD load, 1 element, multiple, 3 reg, Q-form
 def : InstRW<[N2Write_6cyc_3L], (instregex "^LD1Threev(16b|8h|4s|2d)$")>;
-def : InstRW<[N2Write_6cyc_3L, WriteAdr],
+def : InstRW<[WriteAdr, N2Write_6cyc_3L],
              (instregex "^LD1Threev(16b|8h|4s|2d)_POST$")>;
 
 // ASIMD load, 1 element, multiple, 4 reg, D-form
 def : InstRW<[N2Write_7cyc_4L], (instregex "^LD1Fourv(8b|4h|2s|1d)$")>;
-def : InstRW<[N2Write_7cyc_4L, WriteAdr],
+def : InstRW<[WriteAdr, N2Write_7cyc_4L],
              (instregex "^LD1Fourv(8b|4h|2s|1d)_POST$")>;
 
 // ASIMD load, 1 element, multiple, 4 reg, Q-form
 def : InstRW<[N2Write_7cyc_4L], (instregex "^LD1Fourv(16b|8h|4s|2d)$")>;
-def : InstRW<[N2Write_7cyc_4L, WriteAdr],
+def : InstRW<[WriteAdr, N2Write_7cyc_4L],
              (instregex "^LD1Fourv(16b|8h|4s|2d)_POST$")>;
 
 // ASIMD load, 1 element, one lane, B/H/S
 // ASIMD load, 1 element, one lane, D
 def : InstRW<[N2Write_8cyc_1L_1V],           (instregex "LD1i(8|16|32|64)$")>;
-def : InstRW<[N2Write_8cyc_1L_1V, WriteAdr], (instregex "LD1i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_8cyc_1L_1V], (instregex "LD1i(8|16|32|64)_POST$")>;
 
 // ASIMD load, 1 element, all lanes, D-form, B/H/S
 // ASIMD load, 1 element, all lanes, D-form, D
 def : InstRW<[N2Write_8cyc_1L_1V],           (instregex "LD1Rv(8b|4h|2s|1d)$")>;
-def : InstRW<[N2Write_8cyc_1L_1V, WriteAdr], (instregex "LD1Rv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_8cyc_1L_1V], (instregex "LD1Rv(8b|4h|2s|1d)_POST$")>;
 
 // ASIMD load, 1 element, all lanes, Q-form
 def : InstRW<[N2Write_8cyc_1L_1V],           (instregex "LD1Rv(16b|8h|4s|2d)$")>;
-def : InstRW<[N2Write_8cyc_1L_1V, WriteAdr], (instregex "LD1Rv(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_8cyc_1L_1V], (instregex "LD1Rv(16b|8h|4s|2d)_POST$")>;
 
 // ASIMD load, 2 element, multiple, D-form, B/H/S
 def : InstRW<[N2Write_8cyc_1L_2V],           (instregex "LD2Twov(8b|4h|2s)$")>;
-def : InstRW<[N2Write_8cyc_1L_2V, WriteAdr], (instregex "LD2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_8cyc_1L_2V], (instregex "LD2Twov(8b|4h|2s)_POST$")>;
 
 // ASIMD load, 2 element, multiple, Q-form, B/H/S
 // ASIMD load, 2 element, multiple, Q-form, D
 def : InstRW<[N2Write_8cyc_2L_2V],           (instregex "LD2Twov(16b|8h|4s|2d)$")>;
-def : InstRW<[N2Write_8cyc_2L_2V, WriteAdr], (instregex "LD2Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_8cyc_2L_2V], (instregex "LD2Twov(16b|8h|4s|2d)_POST$")>;
 
 // ASIMD load, 2 element, one lane, B/H
 // ASIMD load, 2 element, one lane, S
 // ASIMD load, 2 element, one lane, D
 def : InstRW<[N2Write_8cyc_1L_2V],           (instregex "LD2i(8|16|32|64)$")>;
-def : InstRW<[N2Write_8cyc_1L_2V, WriteAdr], (instregex "LD2i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_8cyc_1L_2V], (instregex "LD2i(8|16|32|64)_POST$")>;
 
 // ASIMD load, 2 element, all lanes, D-form, B/H/S
 // ASIMD load, 2 element, all lanes, D-form, D
 def : InstRW<[N2Write_8cyc_1L_2V],            (instregex "LD2Rv(8b|4h|2s|1d)$")>;
-def : InstRW<[N2Write_8cyc_1L_2V, WriteAdr],  (instregex "LD2Rv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_8cyc_1L_2V],  (instregex "LD2Rv(8b|4h|2s|1d)_POST$")>;
 
 // ASIMD load, 2 element, all lanes, Q-form
 def : InstRW<[N2Write_8cyc_1L_2V],           (instregex "LD2Rv(16b|8h|4s|2d)$")>;
-def : InstRW<[N2Write_8cyc_1L_2V, WriteAdr], (instregex "LD2Rv(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_8cyc_1L_2V], (instregex "LD2Rv(16b|8h|4s|2d)_POST$")>;
 
 // ASIMD load, 3 element, multiple, D-form, B/H/S
 def : InstRW<[N2Write_8cyc_2L_3V],           (instregex "LD3Threev(8b|4h|2s)$")>;
-def : InstRW<[N2Write_8cyc_2L_3V, WriteAdr], (instregex "LD3Threev(8b|4h|2s)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_8cyc_2L_3V], (instregex "LD3Threev(8b|4h|2s)_POST$")>;
 
 // ASIMD load, 3 element, multiple, Q-form, B/H/S
 def : InstRW<[N2Write_8cyc_3L_3V],           (instregex "LD3Threev(16b|8h|4s)$")>;
-def : InstRW<[N2Write_8cyc_3L_3V, WriteAdr], (instregex "LD3Threev(16b|8h|4s)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_8cyc_3L_3V], (instregex "LD3Threev(16b|8h|4s)_POST$")>;
 
 // ASIMD load, 3 element, multiple, Q-form, D
 def : InstRW<[N2Write_8cyc_3L_3V],           (instregex "LD3Threev(2d)$")>;
-def : InstRW<[N2Write_8cyc_3L_3V, WriteAdr], (instregex "LD3Threev(2d)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_8cyc_3L_3V], (instregex "LD3Threev(2d)_POST$")>;
 
 // ASIMD load, 3 element, one lane, B/H
 // ASIMD load, 3 element, one lane, S
 // ASIMD load, 3 element, one lane, D
 def : InstRW<[N2Write_8cyc_2L_3V],           (instregex "LD3i(8|16|32|64)$")>;
-def : InstRW<[N2Write_8cyc_2L_3V, WriteAdr], (instregex "LD3i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_8cyc_2L_3V], (instregex "LD3i(8|16|32|64)_POST$")>;
 
 // ASIMD load, 3 element, all lanes, D-form, B/H/S
 // ASIMD load, 3 element, all lanes, D-form, D
 def : InstRW<[N2Write_8cyc_2L_3V],           (instregex "LD3Rv(8b|4h|2s|1d)$")>;
-def : InstRW<[N2Write_8cyc_2L_3V, WriteAdr], (instregex "LD3Rv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_8cyc_2L_3V], (instregex "LD3Rv(8b|4h|2s|1d)_POST$")>;
 
 // ASIMD load, 3 element, all lanes, Q-form, B/H/S
 // ASIMD load, 3 element, all lanes, Q-form, D
 def : InstRW<[N2Write_8cyc_3L_3V],           (instregex "LD3Rv(16b|8h|4s|2d)$")>;
-def : InstRW<[N2Write_8cyc_3L_3V, WriteAdr], (instregex "LD3Rv(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_8cyc_3L_3V], (instregex "LD3Rv(16b|8h|4s|2d)_POST$")>;
 
 // ASIMD load, 4 element, multiple, D-form, B/H/S
 def : InstRW<[N2Write_8cyc_3L_4V],           (instregex "LD4Fourv(8b|4h|2s)$")>;
-def : InstRW<[N2Write_8cyc_3L_4V, WriteAdr], (instregex "LD4Fourv(8b|4h|2s)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_8cyc_3L_4V], (instregex "LD4Fourv(8b|4h|2s)_POST$")>;
 
 // ASIMD load, 4 element, multiple, Q-form, B/H/S
 // ASIMD load, 4 element, multiple, Q-form, D
 def : InstRW<[N2Write_9cyc_4L_4V],           (instregex "LD4Fourv(16b|8h|4s|2d)$")>;
-def : InstRW<[N2Write_9cyc_4L_4V, WriteAdr], (instregex "LD4Fourv(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_9cyc_4L_4V], (instregex "LD4Fourv(16b|8h|4s|2d)_POST$")>;
 
 // ASIMD load, 4 element, one lane, B/H
 // ASIMD load, 4 element, one lane, S
 // ASIMD load, 4 element, one lane, D
 def : InstRW<[N2Write_8cyc_3L_4V],           (instregex "LD4i(8|16|32|64)$")>;
-def : InstRW<[N2Write_8cyc_3L_4V, WriteAdr], (instregex "LD4i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_8cyc_3L_4V], (instregex "LD4i(8|16|32|64)_POST$")>;
 
 // ASIMD load, 4 element, all lanes, D-form, B/H/S
 // ASIMD load, 4 element, all lanes, D-form, D
 def : InstRW<[N2Write_8cyc_3L_4V],              (instregex "LD4Rv(8b|4h|2s|1d)$")>;
-def : InstRW<[N2Write_8cyc_3L_4V, WriteAdr],    (instregex "LD4Rv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_8cyc_3L_4V],    (instregex "LD4Rv(8b|4h|2s|1d)_POST$")>;
 
 // ASIMD load, 4 element, all lanes, Q-form, B/H/S
 // ASIMD load, 4 element, all lanes, Q-form, D
 def : InstRW<[N2Write_8cyc_4L_4V],            (instregex "LD4Rv(16b|8h|4s|2d)$")>;
-def : InstRW<[N2Write_8cyc_4L_4V, WriteAdr],  (instregex "LD4Rv(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_8cyc_4L_4V],  (instregex "LD4Rv(16b|8h|4s|2d)_POST$")>;
 
 // ASIMD store instructions
 // -----------------------------------------------------------------------------
 
 // ASIMD store, 1 element, multiple, 1 reg, D-form
 def : InstRW<[N2Write_2cyc_1L01_1V],           (instregex "ST1Onev(8b|4h|2s|1d)$")>;
-def : InstRW<[N2Write_2cyc_1L01_1V, WriteAdr], (instregex "ST1Onev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_2cyc_1L01_1V], (instregex "ST1Onev(8b|4h|2s|1d)_POST$")>;
 
 // ASIMD store, 1 element, multiple, 1 reg, Q-form
 def : InstRW<[N2Write_2cyc_1L01_1V],           (instregex "ST1Onev(16b|8h|4s|2d)$")>;
-def : InstRW<[N2Write_2cyc_1L01_1V, WriteAdr], (instregex "ST1Onev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_2cyc_1L01_1V], (instregex "ST1Onev(16b|8h|4s|2d)_POST$")>;
 
 // ASIMD store, 1 element, multiple, 2 reg, D-form
 def : InstRW<[N2Write_2cyc_1L01_1V],           (instregex "ST1Twov(8b|4h|2s|1d)$")>;
-def : InstRW<[N2Write_2cyc_1L01_1V, WriteAdr], (instregex "ST1Twov(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_2cyc_1L01_1V], (instregex "ST1Twov(8b|4h|2s|1d)_POST$")>;
 
 // ASIMD store, 1 element, multiple, 2 reg, Q-form
 def : InstRW<[N2Write_2cyc_2L01_2V],           (instregex "ST1Twov(16b|8h|4s|2d)$")>;
-def : InstRW<[N2Write_2cyc_2L01_2V, WriteAdr], (instregex "ST1Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_2cyc_2L01_2V], (instregex "ST1Twov(16b|8h|4s|2d)_POST$")>;
 
 // ASIMD store, 1 element, multiple, 3 reg, D-form
 def : InstRW<[N2Write_2cyc_2L01_2V],           (instregex "ST1Threev(8b|4h|2s|1d)$")>;
-def : InstRW<[N2Write_2cyc_2L01_2V, WriteAdr], (instregex "ST1Threev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_2cyc_2L01_2V], (instregex "ST1Threev(8b|4h|2s|1d)_POST$")>;
 
 // ASIMD store, 1 element, multiple, 3 reg, Q-form
 def : InstRW<[N2Write_2cyc_3L01_3V],           (instregex "ST1Threev(16b|8h|4s|2d)$")>;
-def : InstRW<[N2Write_2cyc_3L01_3V, WriteAdr], (instregex "ST1Threev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_2cyc_3L01_3V], (instregex "ST1Threev(16b|8h|4s|2d)_POST$")>;
 
 // ASIMD store, 1 element, multiple, 4 reg, D-form
 def : InstRW<[N2Write_2cyc_2L01_2V],           (instregex "ST1Fourv(8b|4h|2s|1d)$")>;
-def : InstRW<[N2Write_2cyc_2L01_2V, WriteAdr], (instregex "ST1Fourv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_2cyc_2L01_2V], (instregex "ST1Fourv(8b|4h|2s|1d)_POST$")>;
 
 // ASIMD store, 1 element, multiple, 4 reg, Q-form
 def : InstRW<[N2Write_2cyc_4L01_4V],           (instregex "ST1Fourv(16b|8h|4s|2d)$")>;
-def : InstRW<[N2Write_2cyc_4L01_4V, WriteAdr], (instregex "ST1Fourv(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_2cyc_4L01_4V], (instregex "ST1Fourv(16b|8h|4s|2d)_POST$")>;
 
 // ASIMD store, 1 element, one lane, B/H/S
 // ASIMD store, 1 element, one lane, D
 def : InstRW<[N2Write_4cyc_1L01_1V],           (instregex "ST1i(8|16|32|64)$")>;
-def : InstRW<[N2Write_4cyc_1L01_1V, WriteAdr], (instregex "ST1i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_4cyc_1L01_1V], (instregex "ST1i(8|16|32|64)_POST$")>;
 
 // ASIMD store, 2 element, multiple, D-form, B/H/S
 def : InstRW<[N2Write_4cyc_1L01_1V],           (instregex "ST2Twov(8b|4h|2s)$")>;
-def : InstRW<[N2Write_4cyc_1L01_1V, WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_4cyc_1L01_1V], (instregex "ST2Twov(8b|4h|2s)_POST$")>;
 
 // ASIMD store, 2 element, multiple, Q-form, B/H/S
 // ASIMD store, 2 element, multiple, Q-form, D
 def : InstRW<[N2Write_4cyc_2L01_2V],           (instregex "ST2Twov(16b|8h|4s|2d)$")>;
-def : InstRW<[N2Write_4cyc_2L01_2V, WriteAdr], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_4cyc_2L01_2V], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>;
 
 // ASIMD store, 2 element, one lane, B/H/S
 // ASIMD store, 2 element, one lane, D
 def : InstRW<[N2Write_4cyc_1L01_1V],           (instregex "ST2i(8|16|32|64)$")>;
-def : InstRW<[N2Write_4cyc_1L01_1V, WriteAdr], (instregex "ST2i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_4cyc_1L01_1V], (instregex "ST2i(8|16|32|64)_POST$")>;
 
 // ASIMD store, 3 element, multiple, D-form, B/H/S
 def : InstRW<[N2Write_5cyc_2L01_2V],           (instregex "ST3Threev(8b|4h|2s)$")>;
-def : InstRW<[N2Write_5cyc_2L01_2V, WriteAdr], (instregex "ST3Threev(8b|4h|2s)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_5cyc_2L01_2V], (instregex "ST3Threev(8b|4h|2s)_POST$")>;
 
 // ASIMD store, 3 element, multiple, Q-form, B/H/S
 // ASIMD store, 3 element, multiple, Q-form, D
 def : InstRW<[N2Write_6cyc_3L01_3V],           (instregex "ST3Threev(16b|8h|4s|2d)$")>;
-def : InstRW<[N2Write_6cyc_3L01_3V, WriteAdr], (instregex "ST3Threev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_6cyc_3L01_3V], (instregex "ST3Threev(16b|8h|4s|2d)_POST$")>;
 
 // ASIMD store, 3 element, one lane, B/H
 // ASIMD store, 3 element, one lane, S
 // ASIMD store, 3 element, one lane, D
 def : InstRW<[N2Write_6cyc_3L01_3V],           (instregex "ST3i(8|16|32|64)$")>;
-def : InstRW<[N2Write_6cyc_3L01_3V, WriteAdr], (instregex "ST3i(8|16|32|64)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_6cyc_3L01_3V], (instregex "ST3i(8|16|32|64)_POST$")>;
 
 // ASIMD store, 4 element, multiple, D-form, B/H/S
 def : InstRW<[N2Write_6cyc_3L01_3V],           (instregex "ST4Fourv(8b|4h|2s)$")>;
-def : InstRW<[N2Write_6cyc_3L01_3V, WriteAdr], (instregex "ST4Fourv(8b|4h|2s)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_6cyc_3L01_3V], (instregex "ST4Fourv(8b|4h|2s)_POST$")>;
 
 // ASIMD store, 4 element, multiple, Q-form, B/H/S
 def : InstRW<[N2Write_7cyc_6L01_6V],           (instregex "ST4Fourv(16b|8h|4s)$")>;
-def : InstRW<[N2Write_7cyc_6L01_6V, WriteAdr], (instregex "ST4Fourv(16b|8h|4s)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_7cyc_6L01_6V], (instregex "ST4Fourv(16b|8h|4s)_POST$")>;
 
 // ASIMD store, 4 element, multiple, Q-form, D
 def : InstRW<[N2Write_5cyc_4L01_4V],           (instregex "ST4Fourv(2d)$")>;
-def : InstRW<[N2Write_5cyc_4L01_4V, WriteAdr], (instregex "ST4Fourv(2d)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_5cyc_4L01_4V], (instregex "ST4Fourv(2d)_POST$")>;
 
 // ASIMD store, 4 element, one lane, B/H/S
 def : InstRW<[N2Write_6cyc_3L01_3V],           (instregex "ST4i(8|16|32)$")>;
-def : InstRW<[N2Write_6cyc_3L01_3V, WriteAdr], (instregex "ST4i(8|16|32)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_6cyc_3L01_3V], (instregex "ST4i(8|16|32)_POST$")>;
 
 // ASIMD store, 4 element, one lane, D
 def : InstRW<[N2Write_4cyc_3L01_3V],            (instregex "ST4i(64)$")>;
-def : InstRW<[N2Write_4cyc_3L01_3V, WriteAdr],  (instregex "ST4i(64)_POST$")>;
+def : InstRW<[WriteAdr, N2Write_4cyc_3L01_3V],  (instregex "ST4i(64)_POST$")>;
 
 // Cryptography extensions
 // -----------------------------------------------------------------------------
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-writeback.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-writeback.s
index 7ee97a088ecba18..0c6ccc1face972f 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-writeback.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N2-writeback.s
@@ -1182,28 +1182,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total Cycles:      508
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.50
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    2.95
+# CHECK-NEXT: IPC:               1.97
 # CHECK-NEXT: Block RThroughput: 2.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     012
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .  .   ld1	{ v1.1d }, [x27], #8
-# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D======eeeeeeER.    .    .    .  .   ld1	{ v1.2d }, [x27], #16
-# CHECK-NEXT: [0,3]     D============eER    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     D============eeeeeeER    .    .  .   ld1	{ v1.2s }, [x27], #8
-# CHECK-NEXT: [0,5]     D==================eER   .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D=================eeeeeeER   .  .   ld1	{ v1.4h }, [x27], #8
-# CHECK-NEXT: [0,7]     .D=======================eER  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .D=======================eeeeeeER.   ld1	{ v1.4s }, [x27], #16
-# CHECK-NEXT: [0,9]     .D=============================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeER . .   ld1	{ v1.1d }, [x27], #8
+# CHECK-NEXT: [0,1]     D=eE----R . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeeER. .   ld1	{ v1.2d }, [x27], #16
+# CHECK-NEXT: [0,3]     D==eE----R. .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D==eeeeeeER .   ld1	{ v1.2s }, [x27], #8
+# CHECK-NEXT: [0,5]     D===eE----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeeER.   ld1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: [0,7]     .D===eE----R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===eeeeeeER   ld1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: [0,9]     .D====eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1213,43 +1213,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.1d }, [x27], #8
-# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1	{ v1.2d }, [x27], #16
-# CHECK-NEXT: 3.     1     13.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld1	{ v1.2s }, [x27], #8
-# CHECK-NEXT: 5.     1     19.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     18.0   0.0    0.0       ld1	{ v1.4h }, [x27], #8
-# CHECK-NEXT: 7.     1     24.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     24.0   0.0    0.0       ld1	{ v1.4s }, [x27], #16
-# CHECK-NEXT: 9.     1     30.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     15.6   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld1	{ v1.2d }, [x27], #16
+# CHECK-NEXT: 3.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       ld1	{ v1.2s }, [x27], #8
+# CHECK-NEXT: 5.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: 7.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     4.0    0.0    0.0       ld1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: 9.     1     5.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.1    0.1    2.0       <total>
 
 # CHECK:      [1] Code Region - G02
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total Cycles:      508
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.50
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    2.95
+# CHECK-NEXT: IPC:               1.97
 # CHECK-NEXT: Block RThroughput: 2.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     012
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .  .   ld1	{ v1.8b }, [x27], #8
-# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D======eeeeeeER.    .    .    .  .   ld1	{ v1.8h }, [x27], #16
-# CHECK-NEXT: [0,3]     D============eER    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     D============eeeeeeER    .    .  .   ld1	{ v1.16b }, [x27], #16
-# CHECK-NEXT: [0,5]     D==================eER   .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D=================eeeeeeER   .  .   ld1	{ v1.1d }, [x27], x28
-# CHECK-NEXT: [0,7]     .D=======================eER  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .D=======================eeeeeeER.   ld1	{ v1.2d }, [x27], x28
-# CHECK-NEXT: [0,9]     .D=============================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeER . .   ld1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: [0,1]     D=eE----R . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeeER. .   ld1	{ v1.8h }, [x27], #16
+# CHECK-NEXT: [0,3]     D==eE----R. .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D==eeeeeeER .   ld1	{ v1.16b }, [x27], #16
+# CHECK-NEXT: [0,5]     D===eE----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeeER.   ld1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: [0,7]     .D===eE----R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===eeeeeeER   ld1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .D====eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1259,43 +1259,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.8b }, [x27], #8
-# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1	{ v1.8h }, [x27], #16
-# CHECK-NEXT: 3.     1     13.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld1	{ v1.16b }, [x27], #16
-# CHECK-NEXT: 5.     1     19.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     18.0   0.0    0.0       ld1	{ v1.1d }, [x27], x28
-# CHECK-NEXT: 7.     1     24.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     24.0   0.0    0.0       ld1	{ v1.2d }, [x27], x28
-# CHECK-NEXT: 9.     1     30.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     15.6   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld1	{ v1.8h }, [x27], #16
+# CHECK-NEXT: 3.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       ld1	{ v1.16b }, [x27], #16
+# CHECK-NEXT: 5.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: 7.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     4.0    0.0    0.0       ld1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     5.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.1    0.1    2.0       <total>
 
 # CHECK:      [2] Code Region - G03
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total Cycles:      508
 # CHECK-NEXT: Total uOps:        1500
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.50
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    2.95
+# CHECK-NEXT: IPC:               1.97
 # CHECK-NEXT: Block RThroughput: 2.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     012
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .  .   ld1	{ v1.2s }, [x27], x28
-# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D======eeeeeeER.    .    .    .  .   ld1	{ v1.4h }, [x27], x28
-# CHECK-NEXT: [0,3]     D============eER    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     D============eeeeeeER    .    .  .   ld1	{ v1.4s }, [x27], x28
-# CHECK-NEXT: [0,5]     D==================eER   .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D=================eeeeeeER   .  .   ld1	{ v1.8b }, [x27], x28
-# CHECK-NEXT: [0,7]     .D=======================eER  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .D=======================eeeeeeER.   ld1	{ v1.8h }, [x27], x28
-# CHECK-NEXT: [0,9]     .D=============================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeER . .   ld1	{ v1.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE----R . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeeER. .   ld1	{ v1.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     D==eE----R. .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D==eeeeeeER .   ld1	{ v1.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     D===eE----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeeER.   ld1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .D===eE----R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===eeeeeeER   ld1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .D====eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1305,43 +1305,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.2s }, [x27], x28
-# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1	{ v1.4h }, [x27], x28
-# CHECK-NEXT: 3.     1     13.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld1	{ v1.4s }, [x27], x28
-# CHECK-NEXT: 5.     1     19.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     18.0   0.0    0.0       ld1	{ v1.8b }, [x27], x28
-# CHECK-NEXT: 7.     1     24.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     24.0   0.0    0.0       ld1	{ v1.8h }, [x27], x28
-# CHECK-NEXT: 9.     1     30.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     15.6   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld1	{ v1.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       ld1	{ v1.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     4.0    0.0    0.0       ld1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     5.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.1    0.1    2.0       <total>
 
 # CHECK:      [3] Code Region - G04
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total Cycles:      508
 # CHECK-NEXT: Total uOps:        1900
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.63
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    3.74
+# CHECK-NEXT: IPC:               1.97
 # CHECK-NEXT: Block RThroughput: 3.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     012
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .  .   ld1	{ v1.16b }, [x27], x28
-# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D======eeeeeeER.    .    .    .  .   ld1	{ v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: [0,3]     D============eER    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     D============eeeeeeER    .    .  .   ld1	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: [0,5]     .D=================eER   .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D=================eeeeeeER   .  .   ld1	{ v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: [0,7]     .D=======================eER  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .D=======================eeeeeeER.   ld1	{ v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: [0,9]     .D=============================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeER . .   ld1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE----R . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeeER. .   ld1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,3]     D==eE----R. .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D==eeeeeeER .   ld1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,5]     .D==eE----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeeER.   ld1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,7]     .D===eE----R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===eeeeeeER   ld1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,9]     .D====eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1351,43 +1351,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.16b }, [x27], x28
-# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1	{ v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: 3.     1     13.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld1	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: 5.     1     18.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     18.0   0.0    0.0       ld1	{ v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: 7.     1     24.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     24.0   0.0    0.0       ld1	{ v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: 9.     1     30.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     15.5   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 3.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    0.0    0.0       ld1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 5.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 7.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     4.0    0.0    0.0       ld1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 9.     1     5.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.0    0.1    2.0       <total>
 
 # CHECK:      [4] Code Region - G05
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total Cycles:      508
 # CHECK-NEXT: Total uOps:        2000
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.67
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    3.94
+# CHECK-NEXT: IPC:               1.97
 # CHECK-NEXT: Block RThroughput: 3.3
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     012
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .  .   ld1	{ v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D======eeeeeeER.    .    .    .  .   ld1	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: [0,3]     D============eER    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D===========eeeeeeER    .    .  .   ld1	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: [0,5]     .D=================eER   .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D=================eeeeeeER   .  .   ld1	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: [0,7]     .D=======================eER  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D======================eeeeeeER.   ld1	{ v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: [0,9]     . D============================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeER . .   ld1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,1]     D=eE----R . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeeER. .   ld1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,3]     D==eE----R. .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeeeER .   ld1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,5]     .D==eE----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeeER.   ld1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,7]     .D===eE----R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeeeER   ld1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,9]     . D===eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1397,43 +1397,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: 3.     1     13.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     12.0   0.0    0.0       ld1	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: 5.     1     18.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     18.0   0.0    0.0       ld1	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: 7.     1     24.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     23.0   0.0    0.0       ld1	{ v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: 9.     1     29.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     15.2   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 3.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ld1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 5.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 7.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       ld1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    2.0       <total>
 
 # CHECK:      [5] Code Region - G06
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total Cycles:      508
 # CHECK-NEXT: Total uOps:        2000
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.67
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    3.94
+# CHECK-NEXT: IPC:               1.97
 # CHECK-NEXT: Block RThroughput: 3.3
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     012
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .  .   ld1	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D======eeeeeeER.    .    .    .  .   ld1	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,3]     D============eER    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D===========eeeeeeER    .    .  .   ld1	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,5]     .D=================eER   .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D=================eeeeeeER   .  .   ld1	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,7]     .D=======================eER  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D======================eeeeeeER.   ld1	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,9]     . D============================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeER . .   ld1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE----R . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeeER. .   ld1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     D==eE----R. .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeeeER .   ld1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     .D==eE----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeeER.   ld1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .D===eE----R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeeeER   ld1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     . D===eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1443,43 +1443,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: 3.     1     13.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     12.0   0.0    0.0       ld1	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: 5.     1     18.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     18.0   0.0    0.0       ld1	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: 7.     1     24.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     23.0   0.0    0.0       ld1	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: 9.     1     29.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     15.2   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ld1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 7.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       ld1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    2.0       <total>
 
 # CHECK:      [6] Code Region - G07
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total Cycles:      508
 # CHECK-NEXT: Total uOps:        2300
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.77
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    4.53
+# CHECK-NEXT: IPC:               1.97
 # CHECK-NEXT: Block RThroughput: 4.3
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     012
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .  .   ld1	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D======eeeeeeER.    .    .    .  .   ld1	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,3]     D============eER    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D===========eeeeeeER    .    .  .   ld1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: [0,5]     .D=================eER   .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D=================eeeeeeER   .  .   ld1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: [0,7]     .D=======================eER  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D======================eeeeeeER.   ld1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: [0,9]     . D============================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeER . .   ld1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE----R . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeeER. .   ld1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     D==eE----R. .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeeeER .   ld1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,5]     .D==eE----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeeER.   ld1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,7]     .D===eE----R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeeeER   ld1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,9]     . D===eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1489,43 +1489,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: 3.     1     13.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     12.0   0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: 5.     1     18.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     18.0   0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: 7.     1     24.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     23.0   0.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: 9.     1     29.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     15.2   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: 5.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 7.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 9.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    2.0       <total>
 
 # CHECK:      [7] Code Region - G08
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total Cycles:      508
 # CHECK-NEXT: Total uOps:        2500
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.83
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    4.92
+# CHECK-NEXT: IPC:               1.97
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     012
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .  .   ld1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D======eeeeeeER.    .    .    .  .   ld1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: [0,3]     D============eER    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D===========eeeeeeER    .    .  .   ld1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: [0,5]     .D=================eER   .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D=================eeeeeeER   .  .   ld1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: [0,7]     .D=======================eER  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D======================eeeeeeER.   ld1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: [0,9]     . D============================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeER . .   ld1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,1]     D=eE----R . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeeER. .   ld1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,3]     D==eE----R. .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeeeER .   ld1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,5]     .D==eE----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeeER.   ld1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,7]     .D===eE----R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeeeER   ld1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,9]     . D===eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1535,43 +1535,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: 3.     1     13.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     12.0   0.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: 5.     1     18.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     18.0   0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: 7.     1     24.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     23.0   0.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: 9.     1     29.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     15.2   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 3.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 5.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 7.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 9.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    2.0       <total>
 
 # CHECK:      [8] Code Region - G09
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total Cycles:      508
 # CHECK-NEXT: Total uOps:        2500
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.83
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    4.92
+# CHECK-NEXT: IPC:               1.97
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     012
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .  .   ld1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D======eeeeeeER.    .    .    .  .   ld1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,3]     D============eER    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D===========eeeeeeER    .    .  .   ld1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,5]     .D=================eER   .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D=================eeeeeeER   .  .   ld1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,7]     .D=======================eER  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D======================eeeeeeER.   ld1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,9]     . D============================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeER . .   ld1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE----R . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeeER. .   ld1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     D==eE----R. .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeeeER .   ld1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     .D==eE----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeeER.   ld1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .D===eE----R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeeeER   ld1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     . D===eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1581,43 +1581,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: 3.     1     13.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     12.0   0.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: 5.     1     18.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     18.0   0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: 7.     1     24.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     23.0   0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: 9.     1     29.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     15.2   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 3.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 7.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    2.0       <total>
 
 # CHECK:      [9] Code Region - G10
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3204
+# CHECK-NEXT: Total Cycles:      608
 # CHECK-NEXT: Total uOps:        2700
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.84
-# CHECK-NEXT: IPC:               0.31
+# CHECK-NEXT: uOps Per Cycle:    4.44
+# CHECK-NEXT: IPC:               1.64
 # CHECK-NEXT: Block RThroughput: 5.7
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .    .   ld1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D======eeeeeeER.    .    .    .    .   ld1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,3]     D============eER    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D===========eeeeeeER    .    .    .   ld1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,5]     .D=================eER   .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D=================eeeeeeeER  .    .   ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: [0,7]     . D=======================eER .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D=======================eeeeeeeER.   ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: [0,9]     . D==============================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeER .  .   ld1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE----R .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeeER.  .   ld1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     D==eE----R.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeeeER  .   ld1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     .D==eE----R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeeeER.   ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,7]     . D==eE-----R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeeeeER   ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,9]     . D===eE-----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1627,43 +1627,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: 3.     1     13.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     12.0   0.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: 5.     1     18.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     18.0   0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: 7.     1     24.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     24.0   0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: 9.     1     31.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     15.5   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 3.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 7.     1     3.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 9.     1     4.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.6    0.1    2.2       <total>
 
 # CHECK:      [10] Code Region - G11
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3504
+# CHECK-NEXT: Total Cycles:      675
 # CHECK-NEXT: Total uOps:        3000
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.86
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    4.44
+# CHECK-NEXT: IPC:               1.48
 # CHECK-NEXT: Block RThroughput: 6.7
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345678
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     01234
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .  .   ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D======eeeeeeeER   .    .    .    .  .   ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: [0,3]     .D=============eER  .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D============eeeeeeeER .    .    .  .   ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: [0,5]     . D===================eER.    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D==================eeeeeeeER    .  .   ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: [0,7]     .  D=========================eER   .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D========================eeeeeeeER.   ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: [0,9]     .   D===============================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeeER.   .   ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,1]     D=eE-----R.   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeeER   .   ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,3]     .D=eE-----R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . DeeeeeeeER  .   ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,5]     . D=eE-----R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeeeeeeER.   ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,7]     .  D==eE-----R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D=eeeeeeeER   ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,9]     .   D==eE-----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1673,43 +1673,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 3.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 5.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    1.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 7.     1     3.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     2.0    0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 9.     1     3.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.9    0.2    2.5       <total>
 
 # CHECK:      [11] Code Region - G12
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3504
+# CHECK-NEXT: Total Cycles:      675
 # CHECK-NEXT: Total uOps:        3000
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.86
-# CHECK-NEXT: IPC:               0.29
+# CHECK-NEXT: uOps Per Cycle:    4.44
+# CHECK-NEXT: IPC:               1.48
 # CHECK-NEXT: Block RThroughput: 6.7
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345678
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     01234
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .  .   ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D======eeeeeeeER   .    .    .    .  .   ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: [0,3]     .D=============eER  .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D============eeeeeeeER .    .    .  .   ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,5]     . D===================eER.    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D==================eeeeeeeER    .  .   ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,7]     .  D=========================eER   .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D========================eeeeeeeER.   ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,9]     .   D===============================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeeER.   .   ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,1]     D=eE-----R.   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeeER   .   ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=eE-----R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . DeeeeeeeER  .   ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,5]     . D=eE-----R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeeeeeeER.   ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D==eE-----R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D=eeeeeeeER   ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D==eE-----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1719,43 +1719,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     25.0   0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: 9.     1     32.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     16.5   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ld1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 5.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    1.0    0.0       ld1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 7.     1     3.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     2.0    0.0    0.0       ld1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 9.     1     3.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.9    0.2    2.5       <total>
 
 # CHECK:      [12] Code Region - G13
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3604
+# CHECK-NEXT: Total Cycles:      1210
 # CHECK-NEXT: Total uOps:        2800
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.78
-# CHECK-NEXT: IPC:               0.28
+# CHECK-NEXT: uOps Per Cycle:    2.31
+# CHECK-NEXT: IPC:               0.83
 # CHECK-NEXT: Block RThroughput: 5.7
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          01
 
-# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .   .   ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,1]     D=======eER    .    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D======eeeeeeeER   .    .    .    .   .   ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,3]     .D=============eER  .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D============eeeeeeeER .    .    .   .   ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,5]     . D===================eER.    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D==================eeeeeeeER    .   .   ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,7]     .  D=========================eER   .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .  D=========================eeeeeeeeER.   ld1	{ v1.b }[0], [x27], #1
-# CHECK-NEXT: [0,9]     .  D=================================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeeER.    .    ..   ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE-----R.    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeeER    .    ..   ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=eE-----R    .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . DeeeeeeeER   .    ..   ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,5]     . D=eE-----R   .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeeeeeeER .    ..   ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D==eE-----R .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .  D========eeeeeeeeER   ld1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: [0,9]     .  D=========eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1765,22 +1765,22 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: 1.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: 3.     1     14.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     13.0   0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: 5.     1     20.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     19.0   0.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: 7.     1     26.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     26.0   0.0    0.0       ld1	{ v1.b }[0], [x27], #1
-# CHECK-NEXT: 9.     1     34.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     16.8   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       ld1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 5.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    1.0    0.0       ld1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 7.     1     3.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     9.0    0.0    0.0       ld1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: 9.     1     10.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.3    0.2    2.6       <total>
 
 # CHECK:      [13] Code Region - G14
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total Cycles:      4003
 # CHECK-NEXT: Total uOps:        2000
 
 # CHECK:      Dispatch Width:    10
@@ -1790,18 +1790,18 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123
-
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld1	{ v1.b }[8], [x27], #1
-# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    .  .   ld1	{ v1.b }[0], [x27], x28
-# CHECK-NEXT: [0,3]     D================eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D===============eeeeeeeeER   .    .    .  .   ld1	{ v1.b }[8], [x27], x28
-# CHECK-NEXT: [0,5]     .D=======================eER  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    .  .   ld1	{ v1.h }[0], [x27], #2
-# CHECK-NEXT: [0,7]     .D===============================eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D==============================eeeeeeeeER.   ld1	{ v1.h }[4], [x27], #2
-# CHECK-NEXT: [0,9]     . D======================================eER   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789          0123456789          012
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    . .   ld1	{ v1.b }[8], [x27], #1
+# CHECK-NEXT: [0,1]     D=eE------R    .    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    . .   ld1	{ v1.b }[0], [x27], x28
+# CHECK-NEXT: [0,3]     D=========eE------R .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===============eeeeeeeeER   .    .    . .   ld1	{ v1.b }[8], [x27], x28
+# CHECK-NEXT: [0,5]     .D================eE------R   .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    . .   ld1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: [0,7]     .D========================eE------R.    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==============================eeeeeeeeER   ld1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT: [0,9]     . D===============================eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1811,22 +1811,22 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.b }[8], [x27], #1
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld1	{ v1.b }[0], [x27], x28
-# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 3.     1     10.0   0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld1	{ v1.b }[8], [x27], x28
-# CHECK-NEXT: 5.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 5.     1     17.0   0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 6.     1     24.0   0.0    0.0       ld1	{ v1.h }[0], [x27], #2
-# CHECK-NEXT: 7.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 7.     1     25.0   0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 8.     1     31.0   0.0    0.0       ld1	{ v1.h }[4], [x27], #2
-# CHECK-NEXT: 9.     1     39.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     20.2   0.1    0.0       <total>
+# CHECK-NEXT: 9.     1     32.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     16.7   0.1    3.0       <total>
 
 # CHECK:      [14] Code Region - G15
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total Cycles:      4003
 # CHECK-NEXT: Total uOps:        2000
 
 # CHECK:      Dispatch Width:    10
@@ -1836,18 +1836,18 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123
-
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld1	{ v1.h }[0], [x27], x28
-# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    .  .   ld1	{ v1.h }[4], [x27], x28
-# CHECK-NEXT: [0,3]     D================eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D===============eeeeeeeeER   .    .    .  .   ld1	{ v1.s }[0], [x27], #4
-# CHECK-NEXT: [0,5]     .D=======================eER  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    .  .   ld1	{ v1.s }[0], [x27], x28
-# CHECK-NEXT: [0,7]     .D===============================eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D==============================eeeeeeeeER.   ld1	{ v1.d }[0], [x27], #8
-# CHECK-NEXT: [0,9]     . D======================================eER   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789          0123456789          012
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    . .   ld1	{ v1.h }[0], [x27], x28
+# CHECK-NEXT: [0,1]     D=eE------R    .    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    . .   ld1	{ v1.h }[4], [x27], x28
+# CHECK-NEXT: [0,3]     D=========eE------R .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===============eeeeeeeeER   .    .    . .   ld1	{ v1.s }[0], [x27], #4
+# CHECK-NEXT: [0,5]     .D================eE------R   .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    . .   ld1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .D========================eE------R.    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==============================eeeeeeeeER   ld1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: [0,9]     . D===============================eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1857,43 +1857,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.h }[0], [x27], x28
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld1	{ v1.h }[4], [x27], x28
-# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 3.     1     10.0   0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld1	{ v1.s }[0], [x27], #4
-# CHECK-NEXT: 5.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 5.     1     17.0   0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 6.     1     24.0   0.0    0.0       ld1	{ v1.s }[0], [x27], x28
-# CHECK-NEXT: 7.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 7.     1     25.0   0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 8.     1     31.0   0.0    0.0       ld1	{ v1.d }[0], [x27], #8
-# CHECK-NEXT: 9.     1     39.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     20.2   0.1    0.0       <total>
+# CHECK-NEXT: 9.     1     32.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     16.7   0.1    3.0       <total>
 
 # CHECK:      [15] Code Region - G16
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total Cycles:      1203
 # CHECK-NEXT: Total uOps:        2000
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.50
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    1.66
+# CHECK-NEXT: IPC:               0.83
 # CHECK-NEXT: Block RThroughput: 2.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123
-
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld1	{ v1.d }[0], [x27], x28
-# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    .  .   ld1r	{ v1.1d }, [x27], #8
-# CHECK-NEXT: [0,3]     D================eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D===============eeeeeeeeER   .    .    .  .   ld1r	{ v1.2d }, [x27], #8
-# CHECK-NEXT: [0,5]     .D=======================eER  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    .  .   ld1r	{ v1.2s }, [x27], #4
-# CHECK-NEXT: [0,7]     .D===============================eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D==============================eeeeeeeeER.   ld1r	{ v1.4h }, [x27], #2
-# CHECK-NEXT: [0,9]     . D======================================eER   add	x0, x27, #1
+# CHECK-NEXT:                     01234
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeER   .   ld1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: [0,1]     D=eE------R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeeeeER  .   ld1r	{ v1.1d }, [x27], #8
+# CHECK-NEXT: [0,3]     D==eE------R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeeeeeER .   ld1r	{ v1.2d }, [x27], #8
+# CHECK-NEXT: [0,5]     .D==eE------R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeeeeER.   ld1r	{ v1.2s }, [x27], #4
+# CHECK-NEXT: [0,7]     .D===eE------R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeeeeeER   ld1r	{ v1.4h }, [x27], #2
+# CHECK-NEXT: [0,9]     . D===eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1903,43 +1903,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1	{ v1.d }[0], [x27], x28
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld1r	{ v1.1d }, [x27], #8
-# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld1r	{ v1.2d }, [x27], #8
-# CHECK-NEXT: 5.     1     24.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     24.0   0.0    0.0       ld1r	{ v1.2s }, [x27], #4
-# CHECK-NEXT: 7.     1     32.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     31.0   0.0    0.0       ld1r	{ v1.4h }, [x27], #2
-# CHECK-NEXT: 9.     1     39.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     20.2   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld1r	{ v1.1d }, [x27], #8
+# CHECK-NEXT: 3.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ld1r	{ v1.2d }, [x27], #8
+# CHECK-NEXT: 5.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld1r	{ v1.2s }, [x27], #4
+# CHECK-NEXT: 7.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       ld1r	{ v1.4h }, [x27], #2
+# CHECK-NEXT: 9.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    3.0       <total>
 
 # CHECK:      [16] Code Region - G17
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total Cycles:      510
 # CHECK-NEXT: Total uOps:        2000
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.50
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    3.92
+# CHECK-NEXT: IPC:               1.96
 # CHECK-NEXT: Block RThroughput: 2.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123
-
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld1r	{ v1.4s }, [x27], #4
-# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    .  .   ld1r	{ v1.8b }, [x27], #1
-# CHECK-NEXT: [0,3]     D================eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D===============eeeeeeeeER   .    .    .  .   ld1r	{ v1.8h }, [x27], #2
-# CHECK-NEXT: [0,5]     .D=======================eER  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    .  .   ld1r	{ v1.16b }, [x27], #1
-# CHECK-NEXT: [0,7]     .D===============================eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D==============================eeeeeeeeER.   ld1r	{ v1.1d }, [x27], x28
-# CHECK-NEXT: [0,9]     . D======================================eER   add	x0, x27, #1
+# CHECK-NEXT:                     01234
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeER   .   ld1r	{ v1.4s }, [x27], #4
+# CHECK-NEXT: [0,1]     D=eE------R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeeeeER  .   ld1r	{ v1.8b }, [x27], #1
+# CHECK-NEXT: [0,3]     D==eE------R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeeeeeER .   ld1r	{ v1.8h }, [x27], #2
+# CHECK-NEXT: [0,5]     .D==eE------R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeeeeER.   ld1r	{ v1.16b }, [x27], #1
+# CHECK-NEXT: [0,7]     .D===eE------R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeeeeeER   ld1r	{ v1.1d }, [x27], x28
+# CHECK-NEXT: [0,9]     . D===eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1949,43 +1949,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1r	{ v1.4s }, [x27], #4
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld1r	{ v1.8b }, [x27], #1
-# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld1r	{ v1.8h }, [x27], #2
-# CHECK-NEXT: 5.     1     24.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     24.0   0.0    0.0       ld1r	{ v1.16b }, [x27], #1
-# CHECK-NEXT: 7.     1     32.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     31.0   0.0    0.0       ld1r	{ v1.1d }, [x27], x28
-# CHECK-NEXT: 9.     1     39.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     20.2   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld1r	{ v1.8b }, [x27], #1
+# CHECK-NEXT: 3.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ld1r	{ v1.8h }, [x27], #2
+# CHECK-NEXT: 5.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld1r	{ v1.16b }, [x27], #1
+# CHECK-NEXT: 7.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       ld1r	{ v1.1d }, [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    3.0       <total>
 
 # CHECK:      [17] Code Region - G18
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total Cycles:      510
 # CHECK-NEXT: Total uOps:        2000
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.50
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    3.92
+# CHECK-NEXT: IPC:               1.96
 # CHECK-NEXT: Block RThroughput: 2.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123
-
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld1r	{ v1.2d }, [x27], x28
-# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    .  .   ld1r	{ v1.2s }, [x27], x28
-# CHECK-NEXT: [0,3]     D================eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D===============eeeeeeeeER   .    .    .  .   ld1r	{ v1.4h }, [x27], x28
-# CHECK-NEXT: [0,5]     .D=======================eER  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    .  .   ld1r	{ v1.4s }, [x27], x28
-# CHECK-NEXT: [0,7]     .D===============================eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D==============================eeeeeeeeER.   ld1r	{ v1.8b }, [x27], x28
-# CHECK-NEXT: [0,9]     . D======================================eER   add	x0, x27, #1
+# CHECK-NEXT:                     01234
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeER   .   ld1r	{ v1.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE------R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeeeeER  .   ld1r	{ v1.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     D==eE------R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeeeeeER .   ld1r	{ v1.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     .D==eE------R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeeeeER.   ld1r	{ v1.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .D===eE------R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeeeeeER   ld1r	{ v1.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     . D===eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -1995,43 +1995,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1r	{ v1.2d }, [x27], x28
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld1r	{ v1.2s }, [x27], x28
-# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld1r	{ v1.4h }, [x27], x28
-# CHECK-NEXT: 5.     1     24.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     24.0   0.0    0.0       ld1r	{ v1.4s }, [x27], x28
-# CHECK-NEXT: 7.     1     32.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     31.0   0.0    0.0       ld1r	{ v1.8b }, [x27], x28
-# CHECK-NEXT: 9.     1     39.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     20.2   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld1r	{ v1.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ld1r	{ v1.4h }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld1r	{ v1.4s }, [x27], x28
+# CHECK-NEXT: 7.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       ld1r	{ v1.8b }, [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    3.0       <total>
 
 # CHECK:      [18] Code Region - G19
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total Cycles:      510
 # CHECK-NEXT: Total uOps:        2400
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.60
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    4.71
+# CHECK-NEXT: IPC:               1.96
 # CHECK-NEXT: Block RThroughput: 4.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123
-
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld1r	{ v1.8h }, [x27], x28
-# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    .  .   ld1r	{ v1.16b }, [x27], x28
-# CHECK-NEXT: [0,3]     D================eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D===============eeeeeeeeER   .    .    .  .   ld2	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: [0,5]     .D=======================eER  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    .  .   ld2	{ v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: [0,7]     . D==============================eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D==============================eeeeeeeeER.   ld2	{ v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: [0,9]     . D======================================eER   add	x0, x27, #1
+# CHECK-NEXT:                     01234
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeER   .   ld1r	{ v1.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE------R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeeeeER  .   ld1r	{ v1.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     D==eE------R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeeeeeER .   ld2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,5]     .D==eE------R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeeeeER.   ld2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,7]     . D==eE------R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeeeeeER   ld2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,9]     . D===eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2041,43 +2041,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld1r	{ v1.8h }, [x27], x28
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld1r	{ v1.16b }, [x27], x28
-# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld2	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: 5.     1     24.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     24.0   0.0    0.0       ld2	{ v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: 7.     1     31.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     31.0   0.0    0.0       ld2	{ v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: 9.     1     39.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     20.1   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld1r	{ v1.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ld2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 5.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 7.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       ld2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 9.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.6    0.1    3.0       <total>
 
 # CHECK:      [19] Code Region - G20
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total Cycles:      510
 # CHECK-NEXT: Total uOps:        2900
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.72
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    5.69
+# CHECK-NEXT: IPC:               1.96
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123
-
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld2	{ v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    .  .   ld2	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: [0,3]     .D===============eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D===============eeeeeeeeER   .    .    .  .   ld2	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: [0,5]     .D=======================eER  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     . D======================eeeeeeeeER.    .  .   ld2	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: [0,7]     . D==============================eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .  D=============================eeeeeeeeER.   ld2	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,9]     .  D=====================================eER   add	x0, x27, #1
+# CHECK-NEXT:                     01234
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeER   .   ld2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,1]     D=eE------R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeeeeER  .   ld2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,3]     .D=eE------R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeeeeeER .   ld2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,5]     .D==eE------R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     . D=eeeeeeeeER.   ld2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,7]     . D==eE------R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .  D=eeeeeeeeER   ld2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .  D==eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2087,43 +2087,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2	{ v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld2	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld2	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: 5.     1     24.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     23.0   0.0    0.0       ld2	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: 7.     1     31.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     30.0   0.0    0.0       ld2	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: 9.     1     38.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     19.7   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 3.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ld2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 5.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       ld2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 7.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     2.0    0.0    0.0       ld2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.2    0.1    3.0       <total>
 
 # CHECK:      [20] Code Region - G21
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total Cycles:      510
 # CHECK-NEXT: Total uOps:        2700
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.67
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    5.29
+# CHECK-NEXT: IPC:               1.96
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123
-
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld2	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    .  .   ld2	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,3]     D================eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D===============eeeeeeeeER   .    .    .  .   ld2	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,5]     .D=======================eER  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    .  .   ld2	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,7]     . D==============================eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D==============================eeeeeeeeER.   ld2	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,9]     . D======================================eER   add	x0, x27, #1
+# CHECK-NEXT:                     01234
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeER   .   ld2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE------R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeeeeER  .   ld2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     D==eE------R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeeeeeER .   ld2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     .D==eE------R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeeeeER.   ld2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     . D==eE------R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeeeeeER   ld2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     . D===eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2133,43 +2133,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld2	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld2	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: 5.     1     24.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     24.0   0.0    0.0       ld2	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: 7.     1     31.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     31.0   0.0    0.0       ld2	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: 9.     1     39.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     20.1   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ld2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       ld2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.6    0.1    3.0       <total>
 
 # CHECK:      [21] Code Region - G22
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total Cycles:      3310
 # CHECK-NEXT: Total uOps:        2600
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.65
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    0.79
+# CHECK-NEXT: IPC:               0.30
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123
-
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld2	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    .  .   ld2	{ v1.b, v2.b }[0], [x27], #2
-# CHECK-NEXT: [0,3]     .D===============eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D===============eeeeeeeeER   .    .    .  .   ld2	{ v1.b, v2.b }[8], [x27], #2
-# CHECK-NEXT: [0,5]     .D=======================eER  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    .  .   ld2	{ v1.b, v2.b }[0], [x27], x28
-# CHECK-NEXT: [0,7]     . D==============================eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D==============================eeeeeeeeER.   ld2	{ v1.b, v2.b }[8], [x27], x28
-# CHECK-NEXT: [0,9]     . D======================================eER   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789          0123456789          012
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    . .   ld2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE------R    .    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    . .   ld2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: [0,3]     .D========eE------R .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===============eeeeeeeeER   .    .    . .   ld2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: [0,5]     .D================eE------R   .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    . .   ld2	{ v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: [0,7]     . D=======================eE------R.    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==============================eeeeeeeeER   ld2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: [0,9]     . D===============================eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2179,22 +2179,22 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld2	{ v1.b, v2.b }[0], [x27], #2
-# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 3.     1     9.0    0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld2	{ v1.b, v2.b }[8], [x27], #2
-# CHECK-NEXT: 5.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 5.     1     17.0   0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 6.     1     24.0   0.0    0.0       ld2	{ v1.b, v2.b }[0], [x27], x28
-# CHECK-NEXT: 7.     1     31.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 7.     1     24.0   0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 8.     1     31.0   0.0    0.0       ld2	{ v1.b, v2.b }[8], [x27], x28
-# CHECK-NEXT: 9.     1     39.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     20.0   0.1    0.0       <total>
+# CHECK-NEXT: 9.     1     32.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     16.5   0.1    3.0       <total>
 
 # CHECK:      [22] Code Region - G23
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total Cycles:      4003
 # CHECK-NEXT: Total uOps:        2500
 
 # CHECK:      Dispatch Width:    10
@@ -2204,18 +2204,18 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123
-
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld2	{ v1.h, v2.h }[0], [x27], #4
-# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    .  .   ld2	{ v1.h, v2.h }[4], [x27], #4
-# CHECK-NEXT: [0,3]     D================eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D===============eeeeeeeeER   .    .    .  .   ld2	{ v1.h, v2.h }[0], [x27], x28
-# CHECK-NEXT: [0,5]     .D=======================eER  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    .  .   ld2	{ v1.h, v2.h }[4], [x27], x28
-# CHECK-NEXT: [0,7]     .D===============================eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D==============================eeeeeeeeER.   ld2	{ v1.s, v2.s }[0], [x27], #8
-# CHECK-NEXT: [0,9]     . D======================================eER   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789          0123456789          012
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    . .   ld2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: [0,1]     D=eE------R    .    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    . .   ld2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: [0,3]     D=========eE------R .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===============eeeeeeeeER   .    .    . .   ld2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .D================eE------R   .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    . .   ld2	{ v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: [0,7]     .D========================eE------R.    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==============================eeeeeeeeER   ld2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: [0,9]     . D===============================eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2225,43 +2225,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2	{ v1.h, v2.h }[0], [x27], #4
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld2	{ v1.h, v2.h }[4], [x27], #4
-# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 3.     1     10.0   0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld2	{ v1.h, v2.h }[0], [x27], x28
-# CHECK-NEXT: 5.     1     24.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 5.     1     17.0   0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 6.     1     24.0   0.0    0.0       ld2	{ v1.h, v2.h }[4], [x27], x28
-# CHECK-NEXT: 7.     1     32.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 7.     1     25.0   0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 8.     1     31.0   0.0    0.0       ld2	{ v1.s, v2.s }[0], [x27], #8
-# CHECK-NEXT: 9.     1     39.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     20.2   0.1    0.0       <total>
+# CHECK-NEXT: 9.     1     32.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     16.7   0.1    3.0       <total>
 
 # CHECK:      [23] Code Region - G24
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total Cycles:      2603
 # CHECK-NEXT: Total uOps:        2500
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.62
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    0.96
+# CHECK-NEXT: IPC:               0.38
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123
-
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld2	{ v1.s, v2.s }[0], [x27], x28
-# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    .  .   ld2	{ v1.d, v2.d }[0], [x27], #16
-# CHECK-NEXT: [0,3]     D================eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D===============eeeeeeeeER   .    .    .  .   ld2	{ v1.d, v2.d }[0], [x27], x28
-# CHECK-NEXT: [0,5]     .D=======================eER  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    .  .   ld2r	{ v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: [0,7]     .D===============================eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D==============================eeeeeeeeER.   ld2r	{ v1.2d, v2.2d }, [x27], #16
-# CHECK-NEXT: [0,9]     . D======================================eER   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          012345678
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .  .   ld2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: [0,1]     D=eE------R    .    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .  .   ld2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: [0,3]     D=========eE------R .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D===============eeeeeeeeER .   ld2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .D================eE------R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D================eeeeeeeeER.   ld2r	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,7]     .D=================eE------R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D================eeeeeeeeER   ld2r	{ v1.2d, v2.2d }, [x27], #16
+# CHECK-NEXT: [0,9]     . D=================eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2271,43 +2271,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2	{ v1.s, v2.s }[0], [x27], x28
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld2	{ v1.d, v2.d }[0], [x27], #16
-# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 3.     1     10.0   0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld2	{ v1.d, v2.d }[0], [x27], x28
-# CHECK-NEXT: 5.     1     24.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     24.0   0.0    0.0       ld2r	{ v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: 7.     1     32.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     31.0   0.0    0.0       ld2r	{ v1.2d, v2.2d }, [x27], #16
-# CHECK-NEXT: 9.     1     39.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     20.2   0.1    0.0       <total>
+# CHECK-NEXT: 5.     1     17.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     17.0   0.0    0.0       ld2r	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 7.     1     18.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     17.0   0.0    0.0       ld2r	{ v1.2d, v2.2d }, [x27], #16
+# CHECK-NEXT: 9.     1     18.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     12.5   0.1    3.0       <total>
 
 # CHECK:      [24] Code Region - G25
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total Cycles:      510
 # CHECK-NEXT: Total uOps:        2500
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.62
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    4.90
+# CHECK-NEXT: IPC:               1.96
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123
-
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld2r	{ v1.2s, v2.2s }, [x27], #8
-# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    .  .   ld2r	{ v1.4h, v2.4h }, [x27], #4
-# CHECK-NEXT: [0,3]     D================eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D===============eeeeeeeeER   .    .    .  .   ld2r	{ v1.4s, v2.4s }, [x27], #8
-# CHECK-NEXT: [0,5]     .D=======================eER  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    .  .   ld2r	{ v1.8b, v2.8b }, [x27], #2
-# CHECK-NEXT: [0,7]     .D===============================eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D==============================eeeeeeeeER.   ld2r	{ v1.8h, v2.8h }, [x27], #4
-# CHECK-NEXT: [0,9]     . D======================================eER   add	x0, x27, #1
+# CHECK-NEXT:                     01234
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeER   .   ld2r	{ v1.2s, v2.2s }, [x27], #8
+# CHECK-NEXT: [0,1]     D=eE------R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeeeeER  .   ld2r	{ v1.4h, v2.4h }, [x27], #4
+# CHECK-NEXT: [0,3]     D==eE------R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeeeeeER .   ld2r	{ v1.4s, v2.4s }, [x27], #8
+# CHECK-NEXT: [0,5]     .D==eE------R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeeeeER.   ld2r	{ v1.8b, v2.8b }, [x27], #2
+# CHECK-NEXT: [0,7]     .D===eE------R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeeeeeER   ld2r	{ v1.8h, v2.8h }, [x27], #4
+# CHECK-NEXT: [0,9]     . D===eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2317,43 +2317,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2r	{ v1.2s, v2.2s }, [x27], #8
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld2r	{ v1.4h, v2.4h }, [x27], #4
-# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld2r	{ v1.4s, v2.4s }, [x27], #8
-# CHECK-NEXT: 5.     1     24.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     24.0   0.0    0.0       ld2r	{ v1.8b, v2.8b }, [x27], #2
-# CHECK-NEXT: 7.     1     32.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     31.0   0.0    0.0       ld2r	{ v1.8h, v2.8h }, [x27], #4
-# CHECK-NEXT: 9.     1     39.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     20.2   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld2r	{ v1.4h, v2.4h }, [x27], #4
+# CHECK-NEXT: 3.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ld2r	{ v1.4s, v2.4s }, [x27], #8
+# CHECK-NEXT: 5.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld2r	{ v1.8b, v2.8b }, [x27], #2
+# CHECK-NEXT: 7.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       ld2r	{ v1.8h, v2.8h }, [x27], #4
+# CHECK-NEXT: 9.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    3.0       <total>
 
 # CHECK:      [25] Code Region - G26
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total Cycles:      510
 # CHECK-NEXT: Total uOps:        2500
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.62
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    4.90
+# CHECK-NEXT: IPC:               1.96
 # CHECK-NEXT: Block RThroughput: 5.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123
-
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld2r	{ v1.16b, v2.16b }, [x27], #2
-# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    .  .   ld2r	{ v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: [0,3]     D================eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D===============eeeeeeeeER   .    .    .  .   ld2r	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,5]     .D=======================eER  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    .  .   ld2r	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,7]     .D===============================eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D==============================eeeeeeeeER.   ld2r	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,9]     . D======================================eER   add	x0, x27, #1
+# CHECK-NEXT:                     01234
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeER   .   ld2r	{ v1.16b, v2.16b }, [x27], #2
+# CHECK-NEXT: [0,1]     D=eE------R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeeeeER  .   ld2r	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,3]     D==eE------R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeeeeeER .   ld2r	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,5]     .D==eE------R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeeeeER.   ld2r	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,7]     .D===eE------R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeeeeeER   ld2r	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,9]     . D===eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2363,43 +2363,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2r	{ v1.16b, v2.16b }, [x27], #2
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld2r	{ v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld2r	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: 5.     1     24.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     24.0   0.0    0.0       ld2r	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: 7.     1     32.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     31.0   0.0    0.0       ld2r	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: 9.     1     39.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     20.2   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld2r	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 3.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ld2r	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld2r	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 7.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       ld2r	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    3.0       <total>
 
 # CHECK:      [26] Code Region - G27
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total Cycles:      609
 # CHECK-NEXT: Total uOps:        2800
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.70
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    4.60
+# CHECK-NEXT: IPC:               1.64
 # CHECK-NEXT: Block RThroughput: 5.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123
-
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld2r	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D========eeeeeeeeER .    .    .    .    .  .   ld2r	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,3]     D================eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D===============eeeeeeeeER   .    .    .  .   ld2r	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,5]     .D=======================eER  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D=======================eeeeeeeeER.    .  .   ld2r	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,7]     .D===============================eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D==============================eeeeeeeeER.   ld3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: [0,9]     . D======================================eER   add	x0, x27, #1
+# CHECK-NEXT:                     01234
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeER   .   ld2r	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE------R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeeeeER  .   ld2r	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,3]     D==eE------R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeeeeeER .   ld2r	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,5]     .D==eE------R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeeeeER.   ld2r	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,7]     .D===eE------R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeeeeeER   ld3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,9]     . D===eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2409,43 +2409,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld2r	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     9.0    0.0    0.0       ld2r	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld2r	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: 5.     1     24.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     24.0   0.0    0.0       ld2r	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: 7.     1     32.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     31.0   0.0    0.0       ld3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: 9.     1     39.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     20.2   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ld2r	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 3.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ld2r	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld2r	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 7.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       ld3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 9.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    3.0       <total>
 
 # CHECK:      [27] Code Region - G28
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total Cycles:      759
 # CHECK-NEXT: Total uOps:        3700
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.92
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    4.87
+# CHECK-NEXT: IPC:               1.32
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123
-
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .    .  .   ld3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: [0,3]     .D===============eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .    .  .   ld3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: [0,5]     . D======================eER  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D=====================eeeeeeeeER.    .  .   ld3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: [0,7]     .  D=============================eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D============================eeeeeeeeER.   ld3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: [0,9]     .   D====================================eER   add	x0, x27, #1
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeER    ..   ld3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,1]     D=eE------R    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeeeER   ..   ld3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,3]     .D=eE------R   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeeeeeeeER ..   ld3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,5]     . D==eE------R ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeeeeeeeER..   ld3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,7]     .  D==eE------R..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D==eeeeeeeeER   ld3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,9]     .   D===eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2455,43 +2455,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: 5.     1     23.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: 7.     1     30.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: 9.     1     37.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     19.0   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 3.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       ld3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 5.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       ld3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 7.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    1.0    0.0       ld3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 9.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.3    0.3    3.0       <total>
 
 # CHECK:      [28] Code Region - G29
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total Cycles:      759
 # CHECK-NEXT: Total uOps:        3800
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.95
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    5.01
+# CHECK-NEXT: IPC:               1.32
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123
-
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .    .  .   ld3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,3]     .D===============eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .    .  .   ld3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,5]     . D======================eER  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D=====================eeeeeeeeER.    .  .   ld3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,7]     .  D=============================eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D============================eeeeeeeeER.   ld3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,9]     .   D====================================eER   add	x0, x27, #1
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeER    ..   ld3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,1]     D=eE------R    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeeeER   ..   ld3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=eE------R   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeeeeeeeER ..   ld3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     . D==eE------R ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeeeeeeeER..   ld3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D==eE------R..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D==eeeeeeeeER   ld3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D===eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2501,43 +2501,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: 5.     1     23.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: 7.     1     30.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: 9.     1     37.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     19.0   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       ld3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       ld3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 7.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    1.0    0.0       ld3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.3    0.3    3.0       <total>
 
 # CHECK:      [29] Code Region - G30
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total Cycles:      2010
 # CHECK-NEXT: Total uOps:        3700
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.92
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    1.84
+# CHECK-NEXT: IPC:               0.50
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123
-
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .    .  .   ld3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,3]     .D===============eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .    .  .   ld3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,5]     . D======================eER  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D=====================eeeeeeeeER.    .  .   ld3	{ v1.b, v2.b, v3.b }[0], [x27], #3
-# CHECK-NEXT: [0,7]     .  D=============================eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D============================eeeeeeeeER.   ld3	{ v1.b, v2.b, v3.b }[8], [x27], #3
-# CHECK-NEXT: [0,9]     .   D====================================eER   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .   .   ld3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE------R    .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeeeER   .    .    .   .   ld3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=eE------R   .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeeeeeeeER .    .    .   .   ld3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     . D==eE------R .    .    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D========eeeeeeeeER   .   .   ld3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: [0,7]     .  D=========eE------R   .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D===============eeeeeeeeER   ld3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: [0,9]     .   D================eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2547,22 +2547,22 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: 5.     1     23.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[0], [x27], #3
-# CHECK-NEXT: 7.     1     30.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[8], [x27], #3
-# CHECK-NEXT: 9.     1     37.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     19.0   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       ld3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     9.0    0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: 7.     1     10.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     16.0   0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: 9.     1     17.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     6.3    0.2    3.0       <total>
 
 # CHECK:      [30] Code Region - G31
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total Cycles:      4003
 # CHECK-NEXT: Total uOps:        3500
 
 # CHECK:      Dispatch Width:    10
@@ -2572,18 +2572,18 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123
-
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld3	{ v1.b, v2.b, v3.b }[0], [x27], x28
-# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .    .  .   ld3	{ v1.b, v2.b, v3.b }[8], [x27], x28
-# CHECK-NEXT: [0,3]     .D===============eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .    .  .   ld3	{ v1.h, v2.h, v3.h }[0], [x27], #6
-# CHECK-NEXT: [0,5]     . D======================eER  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D=====================eeeeeeeeER.    .  .   ld3	{ v1.h, v2.h, v3.h }[4], [x27], #6
-# CHECK-NEXT: [0,7]     .  D=============================eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D============================eeeeeeeeER.   ld3	{ v1.h, v2.h, v3.h }[0], [x27], x28
-# CHECK-NEXT: [0,9]     .   D====================================eER   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789          0123456789          012
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    . .   ld3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: [0,1]     D=eE------R    .    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .    . .   ld3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: [0,3]     .D========eE------R .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .    . .   ld3	{ v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: [0,5]     . D===============eE------R   .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=====================eeeeeeeeER.    . .   ld3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: [0,7]     .  D======================eE------R.    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D============================eeeeeeeeER   ld3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .   D=============================eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2593,22 +2593,22 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3	{ v1.b, v2.b, v3.b }[0], [x27], x28
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld3	{ v1.b, v2.b, v3.b }[8], [x27], x28
-# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 3.     1     9.0    0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld3	{ v1.h, v2.h, v3.h }[0], [x27], #6
-# CHECK-NEXT: 5.     1     23.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 5.     1     16.0   0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld3	{ v1.h, v2.h, v3.h }[4], [x27], #6
-# CHECK-NEXT: 7.     1     30.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 7.     1     23.0   0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld3	{ v1.h, v2.h, v3.h }[0], [x27], x28
-# CHECK-NEXT: 9.     1     37.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     19.0   0.1    0.0       <total>
+# CHECK-NEXT: 9.     1     30.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     15.5   0.1    3.0       <total>
 
 # CHECK:      [31] Code Region - G32
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total Cycles:      4003
 # CHECK-NEXT: Total uOps:        3500
 
 # CHECK:      Dispatch Width:    10
@@ -2618,18 +2618,18 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123
-
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld3	{ v1.h, v2.h, v3.h }[4], [x27], x28
-# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .    .  .   ld3	{ v1.s, v2.s, v3.s }[0], [x27], #12
-# CHECK-NEXT: [0,3]     .D===============eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .    .  .   ld3	{ v1.s, v2.s, v3.s }[0], [x27], x28
-# CHECK-NEXT: [0,5]     . D======================eER  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D=====================eeeeeeeeER.    .  .   ld3	{ v1.d, v2.d, v3.d }[0], [x27], #24
-# CHECK-NEXT: [0,7]     .  D=============================eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D============================eeeeeeeeER.   ld3	{ v1.d, v2.d, v3.d }[0], [x27], x28
-# CHECK-NEXT: [0,9]     .   D====================================eER   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789          0123456789          012
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    . .   ld3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: [0,1]     D=eE------R    .    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .    . .   ld3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: [0,3]     .D========eE------R .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .    . .   ld3	{ v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: [0,5]     . D===============eE------R   .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=====================eeeeeeeeER.    . .   ld3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: [0,7]     .  D======================eE------R.    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D============================eeeeeeeeER   ld3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .   D=============================eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2639,43 +2639,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3	{ v1.h, v2.h, v3.h }[4], [x27], x28
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld3	{ v1.s, v2.s, v3.s }[0], [x27], #12
-# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 3.     1     9.0    0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld3	{ v1.s, v2.s, v3.s }[0], [x27], x28
-# CHECK-NEXT: 5.     1     23.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 5.     1     16.0   0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld3	{ v1.d, v2.d, v3.d }[0], [x27], #24
-# CHECK-NEXT: 7.     1     30.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 7.     1     23.0   0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld3	{ v1.d, v2.d, v3.d }[0], [x27], x28
-# CHECK-NEXT: 9.     1     37.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     19.0   0.1    0.0       <total>
+# CHECK-NEXT: 9.     1     30.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     15.5   0.1    3.0       <total>
 
 # CHECK:      [32] Code Region - G33
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total Cycles:      759
 # CHECK-NEXT: Total uOps:        3700
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.92
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    4.87
+# CHECK-NEXT: IPC:               1.32
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123
-
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .    .  .   ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], #24
-# CHECK-NEXT: [0,3]     .D===============eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .    .  .   ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], #12
-# CHECK-NEXT: [0,5]     . D======================eER  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D=====================eeeeeeeeER.    .  .   ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], #6
-# CHECK-NEXT: [0,7]     .  D=============================eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D============================eeeeeeeeER.   ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], #12
-# CHECK-NEXT: [0,9]     .   D====================================eER   add	x0, x27, #1
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeER    ..   ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,1]     D=eE------R    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeeeER   ..   ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], #24
+# CHECK-NEXT: [0,3]     .D=eE------R   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeeeeeeeER ..   ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], #12
+# CHECK-NEXT: [0,5]     . D==eE------R ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeeeeeeeER..   ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], #6
+# CHECK-NEXT: [0,7]     .  D==eE------R..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D==eeeeeeeeER   ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], #12
+# CHECK-NEXT: [0,9]     .   D===eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2685,43 +2685,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], #24
-# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], #12
-# CHECK-NEXT: 5.     1     23.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], #6
-# CHECK-NEXT: 7.     1     30.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], #12
-# CHECK-NEXT: 9.     1     37.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     19.0   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], #24
+# CHECK-NEXT: 3.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], #12
+# CHECK-NEXT: 5.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], #6
+# CHECK-NEXT: 7.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    1.0    0.0       ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], #12
+# CHECK-NEXT: 9.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.3    0.3    3.0       <total>
 
 # CHECK:      [33] Code Region - G34
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total Cycles:      759
 # CHECK-NEXT: Total uOps:        3800
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.95
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    5.01
+# CHECK-NEXT: IPC:               1.32
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123
-
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], #3
-# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .    .  .   ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], #6
-# CHECK-NEXT: [0,3]     .D===============eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .    .  .   ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], #3
-# CHECK-NEXT: [0,5]     . D======================eER  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D=====================eeeeeeeeER.    .  .   ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: [0,7]     .  D=============================eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D============================eeeeeeeeER.   ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,9]     .   D====================================eER   add	x0, x27, #1
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeER    ..   ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], #3
+# CHECK-NEXT: [0,1]     D=eE------R    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeeeER   ..   ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], #6
+# CHECK-NEXT: [0,3]     .D=eE------R   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeeeeeeeER ..   ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], #3
+# CHECK-NEXT: [0,5]     . D==eE------R ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeeeeeeeER..   ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D==eE------R..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D==eeeeeeeeER   ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D===eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2731,43 +2731,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], #3
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], #6
-# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], #3
-# CHECK-NEXT: 5.     1     23.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: 7.     1     30.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: 9.     1     37.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     19.0   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], #6
+# CHECK-NEXT: 3.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], #3
+# CHECK-NEXT: 5.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       ld3r	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: 7.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    1.0    0.0       ld3r	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.3    0.3    3.0       <total>
 
 # CHECK:      [34] Code Region - G35
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total Cycles:      759
 # CHECK-NEXT: Total uOps:        3700
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.92
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    4.87
+# CHECK-NEXT: IPC:               1.32
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123
-
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .    .  .   ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,3]     .D===============eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .    .  .   ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,5]     . D======================eER  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D=====================eeeeeeeeER.    .  .   ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,7]     .  D=============================eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D============================eeeeeeeeER.   ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,9]     .   D====================================eER   add	x0, x27, #1
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeER    ..   ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE------R    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeeeER   ..   ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=eE------R   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeeeeeeeER ..   ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     . D==eE------R ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeeeeeeeER..   ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D==eE------R..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D==eeeeeeeeER   ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D===eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2777,43 +2777,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3r	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: 5.     1     23.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: 7.     1     30.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: 9.     1     37.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     19.0   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld3r	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       ld3r	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       ld3r	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    1.0    0.0       ld3r	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.3    0.3    3.0       <total>
 
 # CHECK:      [35] Code Region - G36
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4204
+# CHECK-NEXT: Total Cycles:      959
 # CHECK-NEXT: Total uOps:        4600
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    1.09
-# CHECK-NEXT: IPC:               0.24
+# CHECK-NEXT: uOps Per Cycle:    4.80
+# CHECK-NEXT: IPC:               1.04
 # CHECK-NEXT: Block RThroughput: 9.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          012345
-
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .    .   ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=======eeeeeeeeeER.    .    .    .    .    .   ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: [0,3]     .D================eER    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D===============eeeeeeeeER  .    .    .    .   ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: [0,5]     . D=======================eER .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D======================eeeeeeeeER    .    .   ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: [0,7]     .  D==============================eER   .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D=============================eeeeeeeeeER.   ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: [0,9]     .   D======================================eER   add	x0, x27, #1
+# CHECK-NEXT:                     012345678
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeER    .  .   ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE------R    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeeeeER  .  .   ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,3]     .D=eE-------R  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeeeeeeeER .  .   ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,5]     . D==eE------R .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D==eeeeeeeeER  .   ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,7]     .  D===eE------R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D===eeeeeeeeeER   ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,9]     .   D====eE-------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2823,43 +2823,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld3r	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: 5.     1     24.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     23.0   0.0    0.0       ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: 7.     1     31.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     30.0   0.0    0.0       ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: 9.     1     39.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     19.8   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 3.     1     2.0    0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 5.     1     3.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    1.0    0.0       ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 7.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     4.0    1.0    0.0       ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 9.     1     5.0    0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.4    3.2       <total>
 
 # CHECK:      [36] Code Region - G37
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4304
+# CHECK-NEXT: Total Cycles:      1008
 # CHECK-NEXT: Total uOps:        4800
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    1.12
-# CHECK-NEXT: IPC:               0.23
+# CHECK-NEXT: uOps Per Cycle:    4.76
+# CHECK-NEXT: IPC:               0.99
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123456
-
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .    ..   ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=======eeeeeeeeeER.    .    .    .    .    ..   ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: [0,3]     .D================eER    .    .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D===============eeeeeeeeeER .    .    .    ..   ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: [0,5]     . D========================eER.    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D=======================eeeeeeeeeER  .    ..   ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,7]     .  D================================eER .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D===============================eeeeeeeeER.   ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,9]     .   D=======================================eER   add	x0, x27, #1
+# CHECK-NEXT:                     012345678
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeER    .  .   ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,1]     D=eE------R    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeeeeER  .  .   ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,3]     .D=eE-------R  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==eeeeeeeeeER  .   ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,5]     . D===eE-------R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D==eeeeeeeeeER .   ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D===eE-------R .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D====eeeeeeeeER   ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D=====eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2869,43 +2869,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: 5.     1     25.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     24.0   0.0    0.0       ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: 7.     1     33.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     32.0   0.0    0.0       ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: 9.     1     40.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     20.5   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 3.     1     2.0    0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    2.0    0.0       ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 5.     1     4.0    0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 7.     1     4.0    0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     5.0    2.0    0.0       ld4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 9.     1     6.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.1    0.5    3.3       <total>
 
 # CHECK:      [37] Code Region - G38
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4304
+# CHECK-NEXT: Total Cycles:      1009
 # CHECK-NEXT: Total uOps:        4800
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    1.12
-# CHECK-NEXT: IPC:               0.23
+# CHECK-NEXT: uOps Per Cycle:    4.76
+# CHECK-NEXT: IPC:               0.99
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123456
-
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .    ..   ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=======eeeeeeeeeER.    .    .    .    .    ..   ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,3]     .D================eER    .    .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D===============eeeeeeeeER  .    .    .    ..   ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,5]     . D=======================eER .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D======================eeeeeeeeeER   .    ..   ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,7]     .  D===============================eER  .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D==============================eeeeeeeeeER.   ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,9]     .   D=======================================eER   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeER    .   .   ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE------R    .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeeeeER  .   .   ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=eE-------R  .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==eeeeeeeeER.   .   ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,5]     . D===eE------R.   .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D==eeeeeeeeeER  .   ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D===eE-------R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D====eeeeeeeeeER   ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D=====eE-------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2915,22 +2915,22 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: 3.     1     17.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     16.0   0.0    0.0       ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: 5.     1     24.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     23.0   0.0    0.0       ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: 7.     1     32.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     31.0   0.0    0.0       ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: 9.     1     40.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     20.1   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    2.0    0.0       ld4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 5.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 7.     1     4.0    0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     5.0    2.0    0.0       ld4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 9.     1     6.0    0.0    7.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.1    0.5    3.3       <total>
 
 # CHECK:      [38] Code Region - G39
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total Cycles:      4003
 # CHECK-NEXT: Total uOps:        4500
 
 # CHECK:      Dispatch Width:    10
@@ -2940,18 +2940,18 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123
-
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
-# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .    .  .   ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
-# CHECK-NEXT: [0,3]     .D===============eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .    .  .   ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
-# CHECK-NEXT: [0,5]     . D======================eER  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D=====================eeeeeeeeER.    .  .   ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
-# CHECK-NEXT: [0,7]     .  D=============================eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D============================eeeeeeeeER.   ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
-# CHECK-NEXT: [0,9]     .   D====================================eER   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789          0123456789          012
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    . .   ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: [0,1]     D=eE------R    .    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .    . .   ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: [0,3]     .D========eE------R .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .    . .   ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: [0,5]     . D===============eE------R   .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=====================eeeeeeeeER.    . .   ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: [0,7]     .  D======================eE------R.    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D============================eeeeeeeeER   ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: [0,9]     .   D=============================eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -2961,22 +2961,22 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
-# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 3.     1     9.0    0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
-# CHECK-NEXT: 5.     1     23.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 5.     1     16.0   0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
-# CHECK-NEXT: 7.     1     30.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 7.     1     23.0   0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
-# CHECK-NEXT: 9.     1     37.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     19.0   0.1    0.0       <total>
+# CHECK-NEXT: 9.     1     30.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     15.5   0.1    3.0       <total>
 
 # CHECK:      [39] Code Region - G40
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total Cycles:      4003
 # CHECK-NEXT: Total uOps:        4500
 
 # CHECK:      Dispatch Width:    10
@@ -2986,18 +2986,18 @@ add x0, x27, 1
 
 # CHECK:      Timeline view:
 # CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123
-
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
-# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .    .  .   ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
-# CHECK-NEXT: [0,3]     .D===============eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .    .  .   ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
-# CHECK-NEXT: [0,5]     . D======================eER  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D=====================eeeeeeeeER.    .  .   ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
-# CHECK-NEXT: [0,7]     .  D=============================eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D============================eeeeeeeeER.   ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
-# CHECK-NEXT: [0,9]     .   D====================================eER   add	x0, x27, #1
+# CHECK-NEXT: Index     0123456789          0123456789          012
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    . .   ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: [0,1]     D=eE------R    .    .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .    . .   ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .D========eE------R .    .    .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .    . .   ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: [0,5]     . D===============eE------R   .    .    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=====================eeeeeeeeER.    . .   ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: [0,7]     .  D======================eE------R.    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D============================eeeeeeeeER   ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .   D=============================eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3007,43 +3007,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
-# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 3.     1     9.0    0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
-# CHECK-NEXT: 5.     1     23.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 5.     1     16.0   0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
-# CHECK-NEXT: 7.     1     30.0   0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 7.     1     23.0   0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
-# CHECK-NEXT: 9.     1     37.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     19.0   0.1    0.0       <total>
+# CHECK-NEXT: 9.     1     30.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     15.5   0.1    3.0       <total>
 
 # CHECK:      [40] Code Region - G41
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total Cycles:      2103
 # CHECK-NEXT: Total uOps:        4600
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    1.15
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    2.19
+# CHECK-NEXT: IPC:               0.48
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123
-
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
-# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .    .  .   ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
-# CHECK-NEXT: [0,3]     .D===============eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .    .  .   ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: [0,5]     . D======================eER  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D=====================eeeeeeeeER.    .  .   ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
-# CHECK-NEXT: [0,7]     .  D=============================eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D============================eeeeeeeeER.   ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
-# CHECK-NEXT: [0,9]     .   D====================================eER   add	x0, x27, #1
+# CHECK-NEXT:                     0123456789
+# CHECK-NEXT: Index     0123456789          0123
+
+# CHECK:      [0,0]     DeeeeeeeeER    .    .  .   ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: [0,1]     D=eE------R    .    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .  .   ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .D========eE------R .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=======eeeeeeeeER.  .   ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,5]     . D========eE------R.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=========eeeeeeeeER.   ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
+# CHECK-NEXT: [0,7]     .  D==========eE------R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D=========eeeeeeeeER   ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
+# CHECK-NEXT: [0,9]     .   D==========eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3053,43 +3053,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
 # CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
-# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: 5.     1     23.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
-# CHECK-NEXT: 7.     1     30.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
-# CHECK-NEXT: 9.     1     37.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     19.0   0.1    0.0       <total>
+# CHECK-NEXT: 3.     1     9.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     8.0    0.0    0.0       ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 5.     1     9.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     10.0   2.0    0.0       ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #32
+# CHECK-NEXT: 7.     1     11.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     10.0   0.0    0.0       ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #16
+# CHECK-NEXT: 9.     1     11.0   0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     7.9    0.3    3.0       <total>
 
 # CHECK:      [41] Code Region - G42
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total Cycles:      1008
 # CHECK-NEXT: Total uOps:        4800
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    1.20
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    4.76
+# CHECK-NEXT: IPC:               0.99
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123
-
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
-# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .    .  .   ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
-# CHECK-NEXT: [0,3]     .D===============eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .    .  .   ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
-# CHECK-NEXT: [0,5]     . D======================eER  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D=====================eeeeeeeeER.    .  .   ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
-# CHECK-NEXT: [0,7]     .  D=============================eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D============================eeeeeeeeER.   ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
-# CHECK-NEXT: [0,9]     .   D====================================eER   add	x0, x27, #1
+# CHECK-NEXT:                     012345678
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeER    .  .   ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
+# CHECK-NEXT: [0,1]     D=eE------R    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeeeER   .  .   ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
+# CHECK-NEXT: [0,3]     .D=eE------R   .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==eeeeeeeeER.  .   ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
+# CHECK-NEXT: [0,5]     . D===eE------R.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D==eeeeeeeeER  .   ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
+# CHECK-NEXT: [0,7]     .  D===eE------R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D====eeeeeeeeER   ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
+# CHECK-NEXT: [0,9]     .   D=====eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3099,43 +3099,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #8
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
-# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
-# CHECK-NEXT: 5.     1     23.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
-# CHECK-NEXT: 7.     1     30.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
-# CHECK-NEXT: 9.     1     37.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     19.0   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #16
+# CHECK-NEXT: 3.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    2.0    0.0       ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #4
+# CHECK-NEXT: 5.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #8
+# CHECK-NEXT: 7.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     5.0    2.0    0.0       ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #4
+# CHECK-NEXT: 9.     1     6.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.1    0.5    3.0       <total>
 
 # CHECK:      [42] Code Region - G43
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      4004
+# CHECK-NEXT: Total Cycles:      1008
 # CHECK-NEXT: Total uOps:        4700
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    1.17
-# CHECK-NEXT: IPC:               0.25
+# CHECK-NEXT: uOps Per Cycle:    4.66
+# CHECK-NEXT: IPC:               0.99
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789          0123
-
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .    .  .   ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .    .  .   ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,3]     .D===============eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .    .  .   ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,5]     . D======================eER  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D=====================eeeeeeeeER.    .  .   ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,7]     .  D=============================eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D============================eeeeeeeeER.   ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,9]     .   D====================================eER   add	x0, x27, #1
+# CHECK-NEXT:                     012345678
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeeeER    .  .   ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE------R    .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeeeER   .  .   ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=eE------R   .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==eeeeeeeeER.  .   ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     . D===eE------R.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D==eeeeeeeeER  .   ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D===eE------R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D====eeeeeeeeER   ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D=====eE------R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3145,43 +3145,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4r	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: 5.     1     23.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: 7.     1     30.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     29.0   0.0    0.0       ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: 9.     1     37.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     19.0   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld4r	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    2.0    0.0       ld4r	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 5.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ld4r	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 7.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     5.0    2.0    0.0       ld4r	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 9.     1     6.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.1    0.5    3.0       <total>
 
 # CHECK:      [43] Code Region - G44
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3604
+# CHECK-NEXT: Total Cycles:      708
 # CHECK-NEXT: Total uOps:        3900
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    1.08
-# CHECK-NEXT: IPC:               0.28
+# CHECK-NEXT: uOps Per Cycle:    5.51
+# CHECK-NEXT: IPC:               1.41
 # CHECK-NEXT: Block RThroughput: 6.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456789
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     01234
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeeeER    .    .    .    .    .   .   ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,1]     D========eER   .    .    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=======eeeeeeeeER .    .    .    .   .   ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,3]     .D===============eER.    .    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D==============eeeeeeeeER   .    .   .   ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,5]     . D======================eER  .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D=====================eeeeeeER  .   .   ldp	s1, s2, [x27], #248
-# CHECK-NEXT: [0,7]     .  D===========================eER .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .  D===========================eeeeeeER.   ldp	d1, d2, [x27], #496
-# CHECK-NEXT: [0,9]     .  D=================================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeeeER   .   ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE------R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeeeER  .   ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=eE------R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==eeeeeeeeER   ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     . D===eE------R   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D==eeeeeeE-R   ldp	s1, s2, [x27], #248
+# CHECK-NEXT: [0,7]     .  D===eE-----R   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .  D===eeeeeeER   ldp	d1, d2, [x27], #496
+# CHECK-NEXT: [0,9]     .  D====eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3191,43 +3191,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ld4r	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: 1.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     8.0    0.0    0.0       ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: 3.     1     16.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     15.0   0.0    0.0       ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: 5.     1     23.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     22.0   0.0    0.0       ldp	s1, s2, [x27], #248
-# CHECK-NEXT: 7.     1     28.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     28.0   0.0    0.0       ldp	d1, d2, [x27], #496
-# CHECK-NEXT: 9.     1     34.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     18.4   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ld4r	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    2.0    0.0       ld4r	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 5.     1     4.0    0.0    6.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    1.0       ldp	s1, s2, [x27], #248
+# CHECK-NEXT: 7.     1     4.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     4.0    0.0    0.0       ldp	d1, d2, [x27], #496
+# CHECK-NEXT: 9.     1     5.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.9    0.3    2.8       <total>
 
 # CHECK:      [44] Code Region - G45
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2506
+# CHECK-NEXT: Total Cycles:      507
 # CHECK-NEXT: Total uOps:        2800
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    1.12
-# CHECK-NEXT: IPC:               0.40
+# CHECK-NEXT: uOps Per Cycle:    5.52
+# CHECK-NEXT: IPC:               1.97
 # CHECK-NEXT: Block RThroughput: 4.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     01
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .   ldp	q1, q2, [x27], #992
-# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=====eeeeeeER.    .    .    .   ldp	s1, s2, [x27, #248]!
-# CHECK-NEXT: [0,3]     .D===========eER    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D===========eeeeeeER    .    .   ldp	d1, d2, [x27, #496]!
-# CHECK-NEXT: [0,5]     .D=================eER   .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     . D================eeeeeeER   .   ldp	q1, q2, [x27, #992]!
-# CHECK-NEXT: [0,7]     . D======================eER  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D======================eeeeER   ldp	w1, w2, [x27], #248
-# CHECK-NEXT: [0,9]     .  D======================eE--R   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeER ..   ldp	q1, q2, [x27], #992
+# CHECK-NEXT: [0,1]     D=eE----R ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeER..   ldp	s1, s2, [x27, #248]!
+# CHECK-NEXT: [0,3]     .D=eE----R..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeeeER.   ldp	d1, d2, [x27, #496]!
+# CHECK-NEXT: [0,5]     .D==eE----R.   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     . D=eeeeeeER   ldp	q1, q2, [x27, #992]!
+# CHECK-NEXT: [0,7]     . D==eE----R   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeE-R   ldp	w1, w2, [x27], #248
+# CHECK-NEXT: [0,9]     .  D==eE---R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3237,43 +3237,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldp	q1, q2, [x27], #992
-# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     6.0    0.0    0.0       ldp	s1, s2, [x27, #248]!
-# CHECK-NEXT: 3.     1     12.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     12.0   0.0    0.0       ldp	d1, d2, [x27, #496]!
-# CHECK-NEXT: 5.     1     18.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     17.0   0.0    0.0       ldp	q1, q2, [x27, #992]!
-# CHECK-NEXT: 7.     1     23.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     23.0   0.0    0.0       ldp	w1, w2, [x27], #248
-# CHECK-NEXT: 9.     1     23.0   0.0    2.0       add	x0, x27, #1
-# CHECK-NEXT:        1     14.2   0.1    0.2       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       ldp	s1, s2, [x27, #248]!
+# CHECK-NEXT: 3.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ldp	d1, d2, [x27, #496]!
+# CHECK-NEXT: 5.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       ldp	q1, q2, [x27, #992]!
+# CHECK-NEXT: 7.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    1.0       ldp	w1, w2, [x27], #248
+# CHECK-NEXT: 9.     1     3.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.2    0.1    2.0       <total>
 
 # CHECK:      [45] Code Region - G46
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1304
+# CHECK-NEXT: Total Cycles:      1006
 # CHECK-NEXT: Total uOps:        2000
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    1.53
-# CHECK-NEXT: IPC:               0.77
+# CHECK-NEXT: uOps Per Cycle:    1.99
+# CHECK-NEXT: IPC:               0.99
 # CHECK-NEXT: Block RThroughput: 10.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456
+# CHECK-NEXT:                     012345
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeER   .    ..   ldp	x1, x2, [x27], #496
-# CHECK-NEXT: [0,1]     D=eE--R   .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D=eeeeER  .    ..   ldp	w1, w2, [x27, #248]!
-# CHECK-NEXT: [0,3]     D==eE--R  .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D=eeeeER .    ..   ldp	x1, x2, [x27, #496]!
-# CHECK-NEXT: [0,5]     .D==eE--R .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D==eeeeeER    ..   ldpsw	x1, x2, [x27], #248
-# CHECK-NEXT: [0,7]     .D=======eER   ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D======eeeeeER.   ldpsw	x1, x2, [x27, #248]!
-# CHECK-NEXT: [0,9]     . D===========eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeER   .    .   ldp	x1, x2, [x27], #496
+# CHECK-NEXT: [0,1]     D=eE--R   .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeER  .    .   ldp	w1, w2, [x27, #248]!
+# CHECK-NEXT: [0,3]     D==eE--R  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeER .    .   ldp	x1, x2, [x27, #496]!
+# CHECK-NEXT: [0,5]     .D==eE--R .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeER    .   ldpsw	x1, x2, [x27], #248
+# CHECK-NEXT: [0,7]     .D===eE---R    .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D======eeeeeER   ldpsw	x1, x2, [x27, #248]!
+# CHECK-NEXT: [0,9]     . D=======eE---R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3289,10 +3289,10 @@ add x0, x27, 1
 # CHECK-NEXT: 4.     1     2.0    0.0    0.0       ldp	x1, x2, [x27, #496]!
 # CHECK-NEXT: 5.     1     3.0    0.0    2.0       add	x0, x27, #1
 # CHECK-NEXT: 6.     1     3.0    0.0    0.0       ldpsw	x1, x2, [x27], #248
-# CHECK-NEXT: 7.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     7.0    0.0    0.0       ldpsw	x1, x2, [x27, #248]!
-# CHECK-NEXT: 9.     1     12.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     4.3    0.1    0.6       <total>
+# CHECK-NEXT: 7.     1     4.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     7.0    4.0    0.0       ldpsw	x1, x2, [x27, #248]!
+# CHECK-NEXT: 9.     1     8.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.5    0.5    1.2       <total>
 
 # CHECK:      [46] Code Region - G47
 
@@ -3344,28 +3344,28 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total Cycles:      508
 # CHECK-NEXT: Total uOps:        2000
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    0.67
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    3.94
+# CHECK-NEXT: IPC:               1.97
 # CHECK-NEXT: Block RThroughput: 3.8
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     012
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .  .   ldr	b1, [x27, #254]!
-# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D======eeeeeeER.    .    .    .  .   ldr	h1, [x27, #254]!
-# CHECK-NEXT: [0,3]     D============eER    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D===========eeeeeeER    .    .  .   ldr	s1, [x27, #254]!
-# CHECK-NEXT: [0,5]     .D=================eER   .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D=================eeeeeeER   .  .   ldr	d1, [x27, #254]!
-# CHECK-NEXT: [0,7]     .D=======================eER  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D======================eeeeeeER.   ldr	q1, [x27, #254]!
-# CHECK-NEXT: [0,9]     . D============================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeER . .   ldr	b1, [x27, #254]!
+# CHECK-NEXT: [0,1]     D=eE----R . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeeeER. .   ldr	h1, [x27, #254]!
+# CHECK-NEXT: [0,3]     D==eE----R. .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeeeER .   ldr	s1, [x27, #254]!
+# CHECK-NEXT: [0,5]     .D==eE----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeeeER.   ldr	d1, [x27, #254]!
+# CHECK-NEXT: [0,7]     .D===eE----R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeeeER   ldr	q1, [x27, #254]!
+# CHECK-NEXT: [0,9]     . D===eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3375,16 +3375,16 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       ldr	b1, [x27, #254]!
-# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     7.0    0.0    0.0       ldr	h1, [x27, #254]!
-# CHECK-NEXT: 3.     1     13.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     12.0   0.0    0.0       ldr	s1, [x27, #254]!
-# CHECK-NEXT: 5.     1     18.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     18.0   0.0    0.0       ldr	d1, [x27, #254]!
-# CHECK-NEXT: 7.     1     24.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     23.0   0.0    0.0       ldr	q1, [x27, #254]!
-# CHECK-NEXT: 9.     1     29.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     15.2   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       ldr	h1, [x27, #254]!
+# CHECK-NEXT: 3.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       ldr	s1, [x27, #254]!
+# CHECK-NEXT: 5.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       ldr	d1, [x27, #254]!
+# CHECK-NEXT: 7.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       ldr	q1, [x27, #254]!
+# CHECK-NEXT: 9.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    2.0       <total>
 
 # CHECK:      [48] Code Region - G49
 
@@ -3528,28 +3528,27 @@ add x0, x27, 1
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      704
+# CHECK-NEXT: Total Cycles:      504
 # CHECK-NEXT: Total uOps:        1700
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    2.41
-# CHECK-NEXT: IPC:               1.42
+# CHECK-NEXT: uOps Per Cycle:    3.37
+# CHECK-NEXT: IPC:               1.98
 # CHECK-NEXT: Block RThroughput: 2.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0
-# CHECK-NEXT: Index     0123456789
+# CHECK-NEXT: Index     012345678
 
-# CHECK:      [0,0]     DeeeeER   .   ldrsh	x1, [x27, #254]!
-# CHECK-NEXT: [0,1]     D=eE--R   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D=eeeeER  .   ldrsw	x1, [x27], #254
-# CHECK-NEXT: [0,3]     D==eE--R  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     D==eeeeER .   ldrsw	x1, [x27, #254]!
-# CHECK-NEXT: [0,5]     D===eE--R .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D==eeE-R .   st1	{ v1.1d }, [x27], #8
-# CHECK-NEXT: [0,7]     .D====eER .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .D====eeER.   st1	{ v1.2d }, [x27], #16
-# CHECK-NEXT: [0,9]     .D======eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeER .   ldrsh	x1, [x27, #254]!
+# CHECK-NEXT: [0,1]     D=eE--R .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeER.   ldrsw	x1, [x27], #254
+# CHECK-NEXT: [0,3]     D==eE--R.   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     D==eeeeER   ldrsw	x1, [x27, #254]!
+# CHECK-NEXT: [0,5]     D===eE--R   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeE-R   st1	{ v1.1d }, [x27], #8
+# CHECK-NEXT: [0,7]     .D===eE-R   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .D===eeER   st1	{ v1.2d }, [x27], #16
+# CHECK-NEXT: [0,9]     .D====eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3565,37 +3564,36 @@ add x0, x27, 1
 # CHECK-NEXT: 4.     1     3.0    0.0    0.0       ldrsw	x1, [x27, #254]!
 # CHECK-NEXT: 5.     1     4.0    0.0    2.0       add	x0, x27, #1
 # CHECK-NEXT: 6.     1     3.0    0.0    1.0       st1	{ v1.1d }, [x27], #8
-# CHECK-NEXT: 7.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     5.0    0.0    0.0       st1	{ v1.2d }, [x27], #16
-# CHECK-NEXT: 9.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     3.5    0.1    0.7       <total>
+# CHECK-NEXT: 7.     1     4.0    0.0    1.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     4.0    0.0    0.0       st1	{ v1.2d }, [x27], #16
+# CHECK-NEXT: 9.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.1    0.1    0.8       <total>
 
 # CHECK:      [52] Code Region - G53
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total Cycles:      504
 # CHECK-NEXT: Total uOps:        2000
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    1.99
-# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: uOps Per Cycle:    3.97
+# CHECK-NEXT: IPC:               1.98
 # CHECK-NEXT: Block RThroughput: 2.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123
-# CHECK-NEXT: Index     0123456789
+# CHECK-NEXT: Index     012345678
 
-# CHECK:      [0,0]     DeeER.    .  .   st1	{ v1.2s }, [x27], #8
-# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D==eeER   .  .   st1	{ v1.4h }, [x27], #8
-# CHECK-NEXT: [0,3]     D====eER  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D===eeER .  .   st1	{ v1.4s }, [x27], #16
-# CHECK-NEXT: [0,5]     .D=====eER.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D=====eeER  .   st1	{ v1.8b }, [x27], #8
-# CHECK-NEXT: [0,7]     .D=======eER .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D======eeER.   st1	{ v1.8h }, [x27], #16
-# CHECK-NEXT: [0,9]     . D========eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeER.  .   st1	{ v1.2s }, [x27], #8
+# CHECK-NEXT: [0,1]     D=eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeER  .   st1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: [0,3]     D==eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeER .   st1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: [0,5]     .D==eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeER.   st1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: [0,7]     .D===eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeER   st1	{ v1.8h }, [x27], #16
+# CHECK-NEXT: [0,9]     . D===eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3605,43 +3603,42 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.2s }, [x27], #8
-# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st1	{ v1.4h }, [x27], #8
-# CHECK-NEXT: 3.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     4.0    0.0    0.0       st1	{ v1.4s }, [x27], #16
-# CHECK-NEXT: 5.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     6.0    0.0    0.0       st1	{ v1.8b }, [x27], #8
-# CHECK-NEXT: 7.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     7.0    0.0    0.0       st1	{ v1.8h }, [x27], #16
-# CHECK-NEXT: 9.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     5.2    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.4h }, [x27], #8
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       st1	{ v1.4s }, [x27], #16
+# CHECK-NEXT: 5.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       st1	{ v1.8b }, [x27], #8
+# CHECK-NEXT: 7.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       st1	{ v1.8h }, [x27], #16
+# CHECK-NEXT: 9.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    0.0       <total>
 
 # CHECK:      [53] Code Region - G54
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total Cycles:      504
 # CHECK-NEXT: Total uOps:        2000
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    1.99
-# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: uOps Per Cycle:    3.97
+# CHECK-NEXT: IPC:               1.98
 # CHECK-NEXT: Block RThroughput: 2.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123
-# CHECK-NEXT: Index     0123456789
+# CHECK-NEXT: Index     012345678
 
-# CHECK:      [0,0]     DeeER.    .  .   st1	{ v1.16b }, [x27], #16
-# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D==eeER   .  .   st1	{ v1.1d }, [x27], x28
-# CHECK-NEXT: [0,3]     D====eER  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D===eeER .  .   st1	{ v1.2d }, [x27], x28
-# CHECK-NEXT: [0,5]     .D=====eER.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D=====eeER  .   st1	{ v1.2s }, [x27], x28
-# CHECK-NEXT: [0,7]     .D=======eER .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D======eeER.   st1	{ v1.4h }, [x27], x28
-# CHECK-NEXT: [0,9]     . D========eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeER.  .   st1	{ v1.16b }, [x27], #16
+# CHECK-NEXT: [0,1]     D=eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeER  .   st1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: [0,3]     D==eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeER .   st1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: [0,5]     .D==eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeER.   st1	{ v1.2s }, [x27], x28
+# CHECK-NEXT: [0,7]     .D===eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeER   st1	{ v1.4h }, [x27], x28
+# CHECK-NEXT: [0,9]     . D===eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3651,43 +3648,42 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.16b }, [x27], #16
-# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st1	{ v1.1d }, [x27], x28
-# CHECK-NEXT: 3.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     4.0    0.0    0.0       st1	{ v1.2d }, [x27], x28
-# CHECK-NEXT: 5.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     6.0    0.0    0.0       st1	{ v1.2s }, [x27], x28
-# CHECK-NEXT: 7.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     7.0    0.0    0.0       st1	{ v1.4h }, [x27], x28
-# CHECK-NEXT: 9.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     5.2    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.1d }, [x27], x28
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       st1	{ v1.2d }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       st1	{ v1.2s }, [x27], x28
+# CHECK-NEXT: 7.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       st1	{ v1.4h }, [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    0.0       <total>
 
 # CHECK:      [54] Code Region - G55
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total Cycles:      504
 # CHECK-NEXT: Total uOps:        2000
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    1.99
-# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: uOps Per Cycle:    3.97
+# CHECK-NEXT: IPC:               1.98
 # CHECK-NEXT: Block RThroughput: 2.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123
-# CHECK-NEXT: Index     0123456789
+# CHECK-NEXT: Index     012345678
 
-# CHECK:      [0,0]     DeeER.    .  .   st1	{ v1.4s }, [x27], x28
-# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D==eeER   .  .   st1	{ v1.8b }, [x27], x28
-# CHECK-NEXT: [0,3]     D====eER  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D===eeER .  .   st1	{ v1.8h }, [x27], x28
-# CHECK-NEXT: [0,5]     .D=====eER.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D=====eeER  .   st1	{ v1.16b }, [x27], x28
-# CHECK-NEXT: [0,7]     .D=======eER .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D======eeER.   st1	{ v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: [0,9]     . D========eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeER.  .   st1	{ v1.4s }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeER  .   st1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: [0,3]     D==eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeER .   st1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: [0,5]     .D==eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeER.   st1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: [0,7]     .D===eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeER   st1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: [0,9]     . D===eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3697,43 +3693,42 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.4s }, [x27], x28
-# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st1	{ v1.8b }, [x27], x28
-# CHECK-NEXT: 3.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     4.0    0.0    0.0       st1	{ v1.8h }, [x27], x28
-# CHECK-NEXT: 5.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     6.0    0.0    0.0       st1	{ v1.16b }, [x27], x28
-# CHECK-NEXT: 7.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     7.0    0.0    0.0       st1	{ v1.1d, v2.1d }, [x27], #16
-# CHECK-NEXT: 9.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     5.2    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.8b }, [x27], x28
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       st1	{ v1.8h }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       st1	{ v1.16b }, [x27], x28
+# CHECK-NEXT: 7.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       st1	{ v1.1d, v2.1d }, [x27], #16
+# CHECK-NEXT: 9.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    0.0       <total>
 
 # CHECK:      [55] Code Region - G56
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total Cycles:      504
 # CHECK-NEXT: Total uOps:        2400
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    2.39
-# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: uOps Per Cycle:    4.76
+# CHECK-NEXT: IPC:               1.98
 # CHECK-NEXT: Block RThroughput: 3.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123
-# CHECK-NEXT: Index     0123456789
+# CHECK-NEXT: Index     012345678
 
-# CHECK:      [0,0]     DeeER.    .  .   st1	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D==eeER   .  .   st1	{ v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: [0,3]     D====eER  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D===eeER .  .   st1	{ v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: [0,5]     .D=====eER.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D=====eeER  .   st1	{ v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: [0,7]     .D=======eER .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D======eeER.   st1	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: [0,9]     . D========eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeER.  .   st1	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,1]     D=eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeER  .   st1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,3]     D==eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeER .   st1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,5]     .D==eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeER.   st1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,7]     .D===eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeER   st1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,9]     . D===eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3743,43 +3738,42 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st1	{ v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: 3.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     4.0    0.0    0.0       st1	{ v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: 5.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     6.0    0.0    0.0       st1	{ v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: 7.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     7.0    0.0    0.0       st1	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: 9.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     5.2    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       st1	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 5.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       st1	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 7.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       st1	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 9.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    0.0       <total>
 
 # CHECK:      [56] Code Region - G57
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total Cycles:      504
 # CHECK-NEXT: Total uOps:        2600
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    2.59
-# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: uOps Per Cycle:    5.16
+# CHECK-NEXT: IPC:               1.98
 # CHECK-NEXT: Block RThroughput: 4.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123
-# CHECK-NEXT: Index     0123456789
+# CHECK-NEXT: Index     012345678
 
-# CHECK:      [0,0]     DeeER.    .  .   st1	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=eeER   .  .   st1	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: [0,3]     .D===eER  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D===eeER .  .   st1	{ v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: [0,5]     .D=====eER.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     . D====eeER  .   st1	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,7]     . D======eER .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D======eeER.   st1	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,9]     . D========eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeER.  .   st1	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,1]     D=eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeER  .   st1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,3]     .D=eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeER .   st1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: [0,5]     .D==eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     . D=eeER.   st1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,7]     . D==eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeER   st1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,9]     . D===eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3789,43 +3783,42 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     4.0    0.0    0.0       st1	{ v1.1d, v2.1d }, [x27], x28
-# CHECK-NEXT: 5.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     5.0    0.0    0.0       st1	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: 7.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     7.0    0.0    0.0       st1	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: 9.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     4.8    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st1	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: 3.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       st1	{ v1.1d, v2.1d }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       st1	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 7.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       st1	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.3    0.1    0.0       <total>
 
 # CHECK:      [57] Code Region - G58
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total Cycles:      504
 # CHECK-NEXT: Total uOps:        2600
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    2.59
-# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: uOps Per Cycle:    5.16
+# CHECK-NEXT: IPC:               1.98
 # CHECK-NEXT: Block RThroughput: 4.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123
-# CHECK-NEXT: Index     0123456789
+# CHECK-NEXT: Index     012345678
 
-# CHECK:      [0,0]     DeeER.    .  .   st1	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D==eeER   .  .   st1	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,3]     D====eER  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D===eeER .  .   st1	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,5]     .D=====eER.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D=====eeER  .   st1	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,7]     .D=======eER .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D======eeER.   st1	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,9]     . D========eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeER.  .   st1	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eER.  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeER  .   st1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,3]     D==eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeER .   st1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,5]     .D==eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeER.   st1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,7]     .D===eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeER   st1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,9]     . D===eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3835,43 +3828,42 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     3.0    0.0    0.0       st1	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: 3.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     4.0    0.0    0.0       st1	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: 5.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     6.0    0.0    0.0       st1	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: 7.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     7.0    0.0    0.0       st1	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: 9.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     5.2    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 3.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       st1	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       st1	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 7.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       st1	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    0.0       <total>
 
 # CHECK:      [58] Code Region - G59
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total Cycles:      703
 # CHECK-NEXT: Total uOps:        3400
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    3.39
-# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: uOps Per Cycle:    4.84
+# CHECK-NEXT: IPC:               1.42
 # CHECK-NEXT: Block RThroughput: 6.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeER.    .  .   st1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=eeER   .  .   st1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: [0,3]     .D===eER  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D==eeER .  .   st1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: [0,5]     . D====eER.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D===eeER  .   st1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: [0,7]     .  D=====eER .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D====eeER.   st1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: [0,9]     .   D======eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeER.   .   st1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
+# CHECK-NEXT: [0,1]     D=eER.   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeER   .   st1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,3]     .D=eER   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . DeeER  .   st1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,5]     . D=eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeER.   st1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,7]     .  D==eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D=eeER   st1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,9]     .   D==eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3881,43 +3873,42 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.1d, v2.1d, v3.1d }, [x27], #24
-# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     3.0    0.0    0.0       st1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: 5.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     4.0    0.0    0.0       st1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: 7.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     5.0    0.0    0.0       st1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: 9.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     4.0    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st1	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: 3.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       st1	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 5.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    1.0    0.0       st1	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 7.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     2.0    0.0    0.0       st1	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: 9.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.9    0.2    0.0       <total>
 
 # CHECK:      [59] Code Region - G60
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total Cycles:      703
 # CHECK-NEXT: Total uOps:        3600
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    3.59
-# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: uOps Per Cycle:    5.12
+# CHECK-NEXT: IPC:               1.42
 # CHECK-NEXT: Block RThroughput: 6.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeER.    .  .   st1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=eeER   .  .   st1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: [0,3]     .D===eER  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D==eeER .  .   st1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: [0,5]     . D====eER.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D===eeER  .   st1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: [0,7]     .  D=====eER .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D====eeER.   st1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,9]     .   D======eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeER.   .   st1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,1]     D=eER.   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeER   .   st1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,3]     .D=eER   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . DeeER  .   st1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,5]     . D=eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeER.   st1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D==eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D=eeER   st1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D==eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3927,43 +3918,42 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     3.0    0.0    0.0       st1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: 5.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     4.0    0.0    0.0       st1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
-# CHECK-NEXT: 7.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     5.0    0.0    0.0       st1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: 9.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     4.0    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st1	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 3.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       st1	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 5.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    1.0    0.0       st1	{ v1.1d, v2.1d, v3.1d }, [x27], x28
+# CHECK-NEXT: 7.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     2.0    0.0    0.0       st1	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.9    0.2    0.0       <total>
 
 # CHECK:      [60] Code Region - G61
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total Cycles:      703
 # CHECK-NEXT: Total uOps:        3400
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    3.39
-# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: uOps Per Cycle:    4.84
+# CHECK-NEXT: IPC:               1.42
 # CHECK-NEXT: Block RThroughput: 6.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeER.    .  .   st1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=eeER   .  .   st1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,3]     .D===eER  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D==eeER .  .   st1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,5]     . D====eER.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D===eeER  .   st1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,7]     .  D=====eER .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D====eeER.   st1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,9]     .   D======eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeER.   .   st1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eER.   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeER   .   st1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=eER   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . DeeER  .   st1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     . D=eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  DeeER .   st1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D=eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D=eeER   st1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D==eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -3973,43 +3963,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     3.0    0.0    0.0       st1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: 5.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     4.0    0.0    0.0       st1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: 7.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     5.0    0.0    0.0       st1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: 9.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     4.0    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st1	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       st1	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    0.0    0.0       st1	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     2.0    1.0    0.0       st1	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.7    0.2    0.0       <total>
 
 # CHECK:      [61] Code Region - G62
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total Cycles:      704
 # CHECK-NEXT: Total uOps:        3600
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    3.59
-# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: uOps Per Cycle:    5.11
+# CHECK-NEXT: IPC:               1.42
 # CHECK-NEXT: Block RThroughput: 6.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123
+# CHECK-NEXT:                     0
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeER.    .  .   st1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=eeER   .  .   st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: [0,3]     .D===eER  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D==eeER .  .   st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: [0,5]     . D====eER.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D===eeER  .   st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: [0,7]     .  D=====eER .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D====eeER.   st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: [0,9]     .   D======eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeER.    .   st1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eER.    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeER    .   st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: [0,3]     .D=eER    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeER  .   st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,5]     . D==eER  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeER .   st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,7]     .  D==eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D==eeER   st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,9]     .   D===eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4019,43 +4009,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     3.0    0.0    0.0       st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: 5.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     4.0    0.0    0.0       st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: 7.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     5.0    0.0    0.0       st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: 9.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     4.0    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], #32
+# CHECK-NEXT: 3.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 5.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 7.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    1.0    0.0       st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: 9.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.3    0.3    0.0       <total>
 
 # CHECK:      [62] Code Region - G63
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total Cycles:      804
 # CHECK-NEXT: Total uOps:        4200
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    4.18
-# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: uOps Per Cycle:    5.22
+# CHECK-NEXT: IPC:               1.24
 # CHECK-NEXT: Block RThroughput: 8.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123
+# CHECK-NEXT:                     01
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeER.    .  .   st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=eeER   .  .   st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: [0,3]     .D===eER  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D==eeER .  .   st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: [0,5]     . D====eER.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D===eeER  .   st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: [0,7]     .  D=====eER .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D====eeER.   st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: [0,9]     .   D======eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeER.    ..   st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,1]     D=eER.    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeER    ..   st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,3]     .D=eER    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeER  ..   st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,5]     . D==eER  ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeER ..   st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,7]     .  D==eER ..   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D===eeER   st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D====eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4065,43 +4055,42 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     3.0    0.0    0.0       st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: 5.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     4.0    0.0    0.0       st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: 7.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     5.0    0.0    0.0       st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
-# CHECK-NEXT: 9.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     4.0    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 3.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 5.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 7.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     4.0    2.0    0.0       st1	{ v1.1d, v2.1d, v3.1d, v4.1d }, [x27], x28
+# CHECK-NEXT: 9.     1     5.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.5    0.4    0.0       <total>
 
 # CHECK:      [63] Code Region - G64
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1004
+# CHECK-NEXT: Total Cycles:      703
 # CHECK-NEXT: Total uOps:        3800
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    3.78
-# CHECK-NEXT: IPC:               1.00
+# CHECK-NEXT: uOps Per Cycle:    5.41
+# CHECK-NEXT: IPC:               1.42
 # CHECK-NEXT: Block RThroughput: 7.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeER.    .  .   st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,1]     D==eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=eeER   .  .   st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,3]     .D===eER  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D==eeER .  .   st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,5]     . D====eER.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D===eeER  .   st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,7]     .  D=====eER .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D====eeER.   st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,9]     .   D======eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeER.   .   st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eER.   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeER   .   st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=eER   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeER .   st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     . D==eER .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeER.   st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D==eER.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D=eeER   st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D==eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4111,43 +4100,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     3.0    0.0    0.0       st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: 5.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     4.0    0.0    0.0       st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: 7.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     5.0    0.0    0.0       st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: 9.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     4.0    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st1	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       st1	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       st1	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 7.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     2.0    0.0    0.0       st1	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 9.     1     3.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.1    0.2    0.0       <total>
 
 # CHECK:      [64] Code Region - G65
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1604
+# CHECK-NEXT: Total Cycles:      706
 # CHECK-NEXT: Total uOps:        3200
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    2.00
-# CHECK-NEXT: IPC:               0.62
+# CHECK-NEXT: uOps Per Cycle:    4.53
+# CHECK-NEXT: IPC:               1.42
 # CHECK-NEXT: Block RThroughput: 5.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
+# CHECK-NEXT:                     012
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeER.    .    .   .   st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,1]     D==eER    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=eeER   .    .   .   st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,3]     .D===eER  .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D==eeeeER    .   .   st1	{ v1.b }[0], [x27], #1
-# CHECK-NEXT: [0,5]     . D======eER   .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     . D======eeeeER.   .   st1	{ v1.b }[8], [x27], #1
-# CHECK-NEXT: [0,7]     . D==========eER   .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .  D=========eeeeER.   st1	{ v1.b }[0], [x27], x28
-# CHECK-NEXT: [0,9]     .  D=============eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeER.    . .   st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eER.    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeER    . .   st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=eER    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==eeeeER .   st1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: [0,5]     . D===eE--R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     . D===eeeeER.   st1	{ v1.b }[8], [x27], #1
+# CHECK-NEXT: [0,7]     . D====eE--R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .  D===eeeeER   st1	{ v1.b }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .  D====eE--R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4157,43 +4146,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: 1.     1     3.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: 3.     1     4.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     3.0    0.0    0.0       st1	{ v1.b }[0], [x27], #1
-# CHECK-NEXT: 5.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     7.0    0.0    0.0       st1	{ v1.b }[8], [x27], #1
-# CHECK-NEXT: 7.     1     11.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     10.0   0.0    0.0       st1	{ v1.b }[0], [x27], x28
-# CHECK-NEXT: 9.     1     14.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     6.2    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st1	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    2.0    0.0       st1	{ v1.b }[0], [x27], #1
+# CHECK-NEXT: 5.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     4.0    0.0    0.0       st1	{ v1.b }[8], [x27], #1
+# CHECK-NEXT: 7.     1     5.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     4.0    0.0    0.0       st1	{ v1.b }[0], [x27], x28
+# CHECK-NEXT: 9.     1     5.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     3.1    0.3    0.6       <total>
 
 # CHECK:      [65] Code Region - G66
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2004
+# CHECK-NEXT: Total Cycles:      506
 # CHECK-NEXT: Total uOps:        2000
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    1.00
-# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: uOps Per Cycle:    3.95
+# CHECK-NEXT: IPC:               1.98
 # CHECK-NEXT: Block RThroughput: 2.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0123
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeER   .    .    .  .   st1	{ v1.b }[8], [x27], x28
-# CHECK-NEXT: [0,1]     D====eER  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D====eeeeER    .    .  .   st1	{ v1.h }[0], [x27], #2
-# CHECK-NEXT: [0,3]     D========eER   .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D=======eeeeER.    .  .   st1	{ v1.h }[4], [x27], #2
-# CHECK-NEXT: [0,5]     .D===========eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D===========eeeeER .  .   st1	{ v1.h }[0], [x27], x28
-# CHECK-NEXT: [0,7]     .D===============eER.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D==============eeeeER.   st1	{ v1.h }[4], [x27], x28
-# CHECK-NEXT: [0,9]     . D==================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeER   .   st1	{ v1.b }[8], [x27], x28
+# CHECK-NEXT: [0,1]     D=eE--R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeER  .   st1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: [0,3]     D==eE--R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeER .   st1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT: [0,5]     .D==eE--R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeER.   st1	{ v1.h }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .D===eE--R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeER   st1	{ v1.h }[4], [x27], x28
+# CHECK-NEXT: [0,9]     . D===eE--R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4203,43 +4192,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.b }[8], [x27], x28
-# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st1	{ v1.h }[0], [x27], #2
-# CHECK-NEXT: 3.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     8.0    0.0    0.0       st1	{ v1.h }[4], [x27], #2
-# CHECK-NEXT: 5.     1     12.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     12.0   0.0    0.0       st1	{ v1.h }[0], [x27], x28
-# CHECK-NEXT: 7.     1     16.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     15.0   0.0    0.0       st1	{ v1.h }[4], [x27], x28
-# CHECK-NEXT: 9.     1     19.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     10.2   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.h }[0], [x27], #2
+# CHECK-NEXT: 3.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       st1	{ v1.h }[4], [x27], #2
+# CHECK-NEXT: 5.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       st1	{ v1.h }[0], [x27], x28
+# CHECK-NEXT: 7.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       st1	{ v1.h }[4], [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    1.0       <total>
 
 # CHECK:      [66] Code Region - G67
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2004
+# CHECK-NEXT: Total Cycles:      506
 # CHECK-NEXT: Total uOps:        2200
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    1.10
-# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: uOps Per Cycle:    4.35
+# CHECK-NEXT: IPC:               1.98
 # CHECK-NEXT: Block RThroughput: 3.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0123
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeER   .    .    .  .   st1	{ v1.s }[0], [x27], #4
-# CHECK-NEXT: [0,1]     D====eER  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D====eeeeER    .    .  .   st1	{ v1.s }[0], [x27], x28
-# CHECK-NEXT: [0,3]     D========eER   .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D=======eeeeER.    .  .   st1	{ v1.d }[0], [x27], #8
-# CHECK-NEXT: [0,5]     .D===========eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D===========eeeeER .  .   st1	{ v1.d }[0], [x27], x28
-# CHECK-NEXT: [0,7]     .D===============eER.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D==============eeeeER.   st2	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: [0,9]     . D==================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeER   .   st1	{ v1.s }[0], [x27], #4
+# CHECK-NEXT: [0,1]     D=eE--R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeER  .   st1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: [0,3]     D==eE--R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeER .   st1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: [0,5]     .D==eE--R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeER.   st1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .D===eE--R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeER   st2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: [0,9]     . D===eE--R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4249,43 +4238,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st1	{ v1.s }[0], [x27], #4
-# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st1	{ v1.s }[0], [x27], x28
-# CHECK-NEXT: 3.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     8.0    0.0    0.0       st1	{ v1.d }[0], [x27], #8
-# CHECK-NEXT: 5.     1     12.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     12.0   0.0    0.0       st1	{ v1.d }[0], [x27], x28
-# CHECK-NEXT: 7.     1     16.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     15.0   0.0    0.0       st2	{ v1.2d, v2.2d }, [x27], #32
-# CHECK-NEXT: 9.     1     19.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     10.2   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st1	{ v1.s }[0], [x27], x28
+# CHECK-NEXT: 3.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       st1	{ v1.d }[0], [x27], #8
+# CHECK-NEXT: 5.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       st1	{ v1.d }[0], [x27], x28
+# CHECK-NEXT: 7.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       st2	{ v1.2d, v2.2d }, [x27], #32
+# CHECK-NEXT: 9.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    1.0       <total>
 
 # CHECK:      [67] Code Region - G68
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2004
+# CHECK-NEXT: Total Cycles:      506
 # CHECK-NEXT: Total uOps:        2400
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    1.20
-# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: uOps Per Cycle:    4.74
+# CHECK-NEXT: IPC:               1.98
 # CHECK-NEXT: Block RThroughput: 3.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0123
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeER   .    .    .  .   st2	{ v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: [0,1]     D====eER  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D====eeeeER    .    .  .   st2	{ v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: [0,3]     D========eER   .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D=======eeeeER.    .  .   st2	{ v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: [0,5]     .D===========eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D===========eeeeER .  .   st2	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: [0,7]     .D===============eER.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D==============eeeeER.   st2	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: [0,9]     . D==================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeER   .   st2	{ v1.2s, v2.2s }, [x27], #16
+# CHECK-NEXT: [0,1]     D=eE--R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeER  .   st2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: [0,3]     D==eE--R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeER .   st2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: [0,5]     .D==eE--R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeER.   st2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: [0,7]     .D===eE--R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeER   st2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: [0,9]     . D===eE--R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4295,43 +4284,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2	{ v1.2s, v2.2s }, [x27], #16
-# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st2	{ v1.4h, v2.4h }, [x27], #16
-# CHECK-NEXT: 3.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     8.0    0.0    0.0       st2	{ v1.4s, v2.4s }, [x27], #32
-# CHECK-NEXT: 5.     1     12.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     12.0   0.0    0.0       st2	{ v1.8b, v2.8b }, [x27], #16
-# CHECK-NEXT: 7.     1     16.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     15.0   0.0    0.0       st2	{ v1.8h, v2.8h }, [x27], #32
-# CHECK-NEXT: 9.     1     19.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     10.2   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st2	{ v1.4h, v2.4h }, [x27], #16
+# CHECK-NEXT: 3.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       st2	{ v1.4s, v2.4s }, [x27], #32
+# CHECK-NEXT: 5.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       st2	{ v1.8b, v2.8b }, [x27], #16
+# CHECK-NEXT: 7.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       st2	{ v1.8h, v2.8h }, [x27], #32
+# CHECK-NEXT: 9.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    1.0       <total>
 
 # CHECK:      [68] Code Region - G69
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2004
+# CHECK-NEXT: Total Cycles:      506
 # CHECK-NEXT: Total uOps:        2600
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    1.30
-# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: uOps Per Cycle:    5.14
+# CHECK-NEXT: IPC:               1.98
 # CHECK-NEXT: Block RThroughput: 4.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0123
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeER   .    .    .  .   st2	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: [0,1]     D====eER  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D===eeeeER    .    .  .   st2	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: [0,3]     .D=======eER   .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D=======eeeeER.    .  .   st2	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: [0,5]     .D===========eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     . D==========eeeeER .  .   st2	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: [0,7]     . D==============eER.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D==============eeeeER.   st2	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: [0,9]     . D==================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeER   .   st2	{ v1.16b, v2.16b }, [x27], #32
+# CHECK-NEXT: [0,1]     D=eE--R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeER  .   st2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=eE--R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeER .   st2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: [0,5]     .D==eE--R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     . D=eeeeER.   st2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: [0,7]     . D==eE--R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeER   st2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: [0,9]     . D===eE--R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4341,43 +4330,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2	{ v1.16b, v2.16b }, [x27], #32
-# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     4.0    0.0    0.0       st2	{ v1.2d, v2.2d }, [x27], x28
-# CHECK-NEXT: 3.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     8.0    0.0    0.0       st2	{ v1.2s, v2.2s }, [x27], x28
-# CHECK-NEXT: 5.     1     12.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     11.0   0.0    0.0       st2	{ v1.4h, v2.4h }, [x27], x28
-# CHECK-NEXT: 7.     1     15.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     15.0   0.0    0.0       st2	{ v1.4s, v2.4s }, [x27], x28
-# CHECK-NEXT: 9.     1     19.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     9.8    0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st2	{ v1.2d, v2.2d }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       st2	{ v1.2s, v2.2s }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       st2	{ v1.4h, v2.4h }, [x27], x28
+# CHECK-NEXT: 7.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       st2	{ v1.4s, v2.4s }, [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.3    0.1    1.0       <total>
 
 # CHECK:      [69] Code Region - G70
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2004
+# CHECK-NEXT: Total Cycles:      506
 # CHECK-NEXT: Total uOps:        2400
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    1.20
-# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: uOps Per Cycle:    4.74
+# CHECK-NEXT: IPC:               1.98
 # CHECK-NEXT: Block RThroughput: 3.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0123
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeER   .    .    .  .   st2	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: [0,1]     D====eER  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D====eeeeER    .    .  .   st2	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: [0,3]     D========eER   .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D=======eeeeER.    .  .   st2	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: [0,5]     .D===========eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D===========eeeeER .  .   st2	{ v1.b, v2.b }[0], [x27], #2
-# CHECK-NEXT: [0,7]     .D===============eER.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D==============eeeeER.   st2	{ v1.b, v2.b }[8], [x27], #2
-# CHECK-NEXT: [0,9]     . D==================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeER   .   st2	{ v1.8b, v2.8b }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE--R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeER  .   st2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: [0,3]     D==eE--R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeER .   st2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: [0,5]     .D==eE--R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeER.   st2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: [0,7]     .D===eE--R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeER   st2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: [0,9]     . D===eE--R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4387,43 +4376,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2	{ v1.8b, v2.8b }, [x27], x28
-# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st2	{ v1.8h, v2.8h }, [x27], x28
-# CHECK-NEXT: 3.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     8.0    0.0    0.0       st2	{ v1.16b, v2.16b }, [x27], x28
-# CHECK-NEXT: 5.     1     12.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     12.0   0.0    0.0       st2	{ v1.b, v2.b }[0], [x27], #2
-# CHECK-NEXT: 7.     1     16.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     15.0   0.0    0.0       st2	{ v1.b, v2.b }[8], [x27], #2
-# CHECK-NEXT: 9.     1     19.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     10.2   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st2	{ v1.8h, v2.8h }, [x27], x28
+# CHECK-NEXT: 3.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       st2	{ v1.16b, v2.16b }, [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       st2	{ v1.b, v2.b }[0], [x27], #2
+# CHECK-NEXT: 7.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       st2	{ v1.b, v2.b }[8], [x27], #2
+# CHECK-NEXT: 9.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    1.0       <total>
 
 # CHECK:      [70] Code Region - G71
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2004
+# CHECK-NEXT: Total Cycles:      506
 # CHECK-NEXT: Total uOps:        2000
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    1.00
-# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: uOps Per Cycle:    3.95
+# CHECK-NEXT: IPC:               1.98
 # CHECK-NEXT: Block RThroughput: 2.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0123
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeER   .    .    .  .   st2	{ v1.b, v2.b }[0], [x27], x28
-# CHECK-NEXT: [0,1]     D====eER  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D====eeeeER    .    .  .   st2	{ v1.b, v2.b }[8], [x27], x28
-# CHECK-NEXT: [0,3]     D========eER   .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D=======eeeeER.    .  .   st2	{ v1.h, v2.h }[0], [x27], #4
-# CHECK-NEXT: [0,5]     .D===========eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D===========eeeeER .  .   st2	{ v1.h, v2.h }[4], [x27], #4
-# CHECK-NEXT: [0,7]     .D===============eER.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D==============eeeeER.   st2	{ v1.h, v2.h }[0], [x27], x28
-# CHECK-NEXT: [0,9]     . D==================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeER   .   st2	{ v1.b, v2.b }[0], [x27], x28
+# CHECK-NEXT: [0,1]     D=eE--R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeER  .   st2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: [0,3]     D==eE--R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeER .   st2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: [0,5]     .D==eE--R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeER.   st2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: [0,7]     .D===eE--R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeER   st2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: [0,9]     . D===eE--R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4433,43 +4422,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2	{ v1.b, v2.b }[0], [x27], x28
-# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st2	{ v1.b, v2.b }[8], [x27], x28
-# CHECK-NEXT: 3.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     8.0    0.0    0.0       st2	{ v1.h, v2.h }[0], [x27], #4
-# CHECK-NEXT: 5.     1     12.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     12.0   0.0    0.0       st2	{ v1.h, v2.h }[4], [x27], #4
-# CHECK-NEXT: 7.     1     16.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     15.0   0.0    0.0       st2	{ v1.h, v2.h }[0], [x27], x28
-# CHECK-NEXT: 9.     1     19.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     10.2   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st2	{ v1.b, v2.b }[8], [x27], x28
+# CHECK-NEXT: 3.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       st2	{ v1.h, v2.h }[0], [x27], #4
+# CHECK-NEXT: 5.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       st2	{ v1.h, v2.h }[4], [x27], #4
+# CHECK-NEXT: 7.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       st2	{ v1.h, v2.h }[0], [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    1.0       <total>
 
 # CHECK:      [71] Code Region - G72
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2004
+# CHECK-NEXT: Total Cycles:      506
 # CHECK-NEXT: Total uOps:        2000
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    1.00
-# CHECK-NEXT: IPC:               0.50
+# CHECK-NEXT: uOps Per Cycle:    3.95
+# CHECK-NEXT: IPC:               1.98
 # CHECK-NEXT: Block RThroughput: 2.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          0123
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeER   .    .    .  .   st2	{ v1.h, v2.h }[4], [x27], x28
-# CHECK-NEXT: [0,1]     D====eER  .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D====eeeeER    .    .  .   st2	{ v1.s, v2.s }[0], [x27], #8
-# CHECK-NEXT: [0,3]     D========eER   .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D=======eeeeER.    .  .   st2	{ v1.s, v2.s }[0], [x27], x28
-# CHECK-NEXT: [0,5]     .D===========eER    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .D===========eeeeER .  .   st2	{ v1.d, v2.d }[0], [x27], #16
-# CHECK-NEXT: [0,7]     .D===============eER.  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     . D==============eeeeER.   st2	{ v1.d, v2.d }[0], [x27], x28
-# CHECK-NEXT: [0,9]     . D==================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeER   .   st2	{ v1.h, v2.h }[4], [x27], x28
+# CHECK-NEXT: [0,1]     D=eE--R   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eeeeER  .   st2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: [0,3]     D==eE--R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeER .   st2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: [0,5]     .D==eE--R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .D==eeeeER.   st2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: [0,7]     .D===eE--R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     . D==eeeeER   st2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: [0,9]     . D===eE--R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4479,43 +4468,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st2	{ v1.h, v2.h }[4], [x27], x28
-# CHECK-NEXT: 1.     1     5.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st2	{ v1.s, v2.s }[0], [x27], #8
-# CHECK-NEXT: 3.     1     9.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     8.0    0.0    0.0       st2	{ v1.s, v2.s }[0], [x27], x28
-# CHECK-NEXT: 5.     1     12.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     12.0   0.0    0.0       st2	{ v1.d, v2.d }[0], [x27], #16
-# CHECK-NEXT: 7.     1     16.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     15.0   0.0    0.0       st2	{ v1.d, v2.d }[0], [x27], x28
-# CHECK-NEXT: 9.     1     19.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     10.2   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     2.0    0.0    0.0       st2	{ v1.s, v2.s }[0], [x27], #8
+# CHECK-NEXT: 3.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    0.0    0.0       st2	{ v1.s, v2.s }[0], [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       st2	{ v1.d, v2.d }[0], [x27], #16
+# CHECK-NEXT: 7.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    0.0    0.0       st2	{ v1.d, v2.d }[0], [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.7    0.1    1.0       <total>
 
 # CHECK:      [72] Code Region - G73
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      1804
+# CHECK-NEXT: Total Cycles:      607
 # CHECK-NEXT: Total uOps:        2800
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    1.55
-# CHECK-NEXT: IPC:               0.55
+# CHECK-NEXT: uOps Per Cycle:    4.61
+# CHECK-NEXT: IPC:               1.65
 # CHECK-NEXT: Block RThroughput: 4.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          01
+# CHECK-NEXT:                     012
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeER .    .    .    ..   st2g	x26, [x27], #4064
-# CHECK-NEXT: [0,1]     D=eER.    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     D=eER.    .    .    ..   st2g	x26, [x27, #4064]!
-# CHECK-NEXT: [0,3]     D==eER    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .D=eeeeeeER    .    ..   st3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: [0,5]     .D=======eER   .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     . D======eeeeeER    ..   st3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: [0,7]     . D===========eER   ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .  D==========eeeeeER.   st3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: [0,9]     .  D===============eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeER .    . .   st2g	x26, [x27], #4064
+# CHECK-NEXT: [0,1]     D=eER.    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     D=eER.    . .   st2g	x26, [x27, #4064]!
+# CHECK-NEXT: [0,3]     D==eER    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .D=eeeeeeER .   st3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
+# CHECK-NEXT: [0,5]     .D==eE----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     . D=eeeeeER .   st3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: [0,7]     . D==eE---R .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .  D==eeeeeER   st3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: [0,9]     .  D===eE---R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4529,39 +4518,39 @@ add x0, x27, 1
 # CHECK-NEXT: 2.     1     2.0    0.0    0.0       st2g	x26, [x27, #4064]!
 # CHECK-NEXT: 3.     1     3.0    0.0    0.0       add	x0, x27, #1
 # CHECK-NEXT: 4.     1     2.0    0.0    0.0       st3	{ v1.2d, v2.2d, v3.2d }, [x27], #48
-# CHECK-NEXT: 5.     1     8.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     7.0    0.0    0.0       st3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
-# CHECK-NEXT: 7.     1     12.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     11.0   0.0    0.0       st3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
-# CHECK-NEXT: 9.     1     16.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     6.4    0.1    0.0       <total>
+# CHECK-NEXT: 5.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       st3	{ v1.2s, v2.2s, v3.2s }, [x27], #24
+# CHECK-NEXT: 7.     1     3.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    1.0    0.0       st3	{ v1.4h, v2.4h, v3.4h }, [x27], #24
+# CHECK-NEXT: 9.     1     4.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.5    0.2    1.0       <total>
 
 # CHECK:      [73] Code Region - G74
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2904
+# CHECK-NEXT: Total Cycles:      708
 # CHECK-NEXT: Total uOps:        3800
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    1.31
-# CHECK-NEXT: IPC:               0.34
+# CHECK-NEXT: uOps Per Cycle:    5.37
+# CHECK-NEXT: IPC:               1.41
 # CHECK-NEXT: Block RThroughput: 7.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     01234
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    .    .    .    . .   st3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: [0,1]     D======eER.    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=====eeeeeER .    .    .    . .   st3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: [0,3]     .D==========eER.    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D=========eeeeeeER.    .    . .   st3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: [0,5]     . D===============eER    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D==============eeeeeeER    . .   st3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: [0,7]     .  D====================eER   . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D===================eeeeeeER.   st3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: [0,9]     .   D=========================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeER .   .   st3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
+# CHECK-NEXT: [0,1]     D=eE----R .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeER .   .   st3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: [0,3]     .D=eE---R .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeeeeeER  .   st3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: [0,5]     . D==eE----R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeeeeeER .   st3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: [0,7]     .  D==eE----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D==eeeeeeER   st3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D===eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4571,43 +4560,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.4s, v2.4s, v3.4s }, [x27], #48
-# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     6.0    0.0    0.0       st3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
-# CHECK-NEXT: 3.     1     11.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     10.0   0.0    0.0       st3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
-# CHECK-NEXT: 5.     1     16.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     15.0   0.0    0.0       st3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
-# CHECK-NEXT: 7.     1     21.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     20.0   0.0    0.0       st3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
-# CHECK-NEXT: 9.     1     26.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     13.3   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st3	{ v1.8b, v2.8b, v3.8b }, [x27], #24
+# CHECK-NEXT: 3.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       st3	{ v1.8h, v2.8h, v3.8h }, [x27], #48
+# CHECK-NEXT: 5.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       st3	{ v1.16b, v2.16b, v3.16b }, [x27], #48
+# CHECK-NEXT: 7.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    1.0    0.0       st3	{ v1.2d, v2.2d, v3.2d }, [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.3    0.3    1.9       <total>
 
 # CHECK:      [74] Code Region - G75
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2704
+# CHECK-NEXT: Total Cycles:      707
 # CHECK-NEXT: Total uOps:        3400
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    1.26
-# CHECK-NEXT: IPC:               0.37
+# CHECK-NEXT: uOps Per Cycle:    4.81
+# CHECK-NEXT: IPC:               1.41
 # CHECK-NEXT: Block RThroughput: 6.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .   st3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: [0,1]     D=====eER .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D====eeeeeER  .    .    .    .   st3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: [0,3]     .D=========eER .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D========eeeeeeER .    .    .   st3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: [0,5]     . D==============eER.    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D=============eeeeeER .    .   st3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: [0,7]     .  D==================eER.    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D=================eeeeeeER.   st3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: [0,9]     .   D=======================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeER  .  .   st3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE---R  .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeER .  .   st3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=eE---R .  .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . DeeeeeeER  .   st3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: [0,5]     . D=eE----R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  DeeeeeER  .   st3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: [0,7]     .  D=eE---R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D=eeeeeeER   st3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: [0,9]     .   D==eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4617,43 +4606,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.2s, v2.2s, v3.2s }, [x27], x28
-# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
-# CHECK-NEXT: 3.     1     10.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     9.0    0.0    0.0       st3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
-# CHECK-NEXT: 5.     1     15.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     14.0   0.0    0.0       st3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
-# CHECK-NEXT: 7.     1     19.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     18.0   0.0    0.0       st3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
-# CHECK-NEXT: 9.     1     24.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     12.1   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st3	{ v1.4h, v2.4h, v3.4h }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    0.0    0.0       st3	{ v1.4s, v2.4s, v3.4s }, [x27], x28
+# CHECK-NEXT: 5.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     1.0    0.0    0.0       st3	{ v1.8b, v2.8b, v3.8b }, [x27], x28
+# CHECK-NEXT: 7.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     2.0    1.0    0.0       st3	{ v1.8h, v2.8h, v3.8h }, [x27], x28
+# CHECK-NEXT: 9.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.7    0.2    1.7       <total>
 
 # CHECK:      [75] Code Region - G76
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total Cycles:      757
 # CHECK-NEXT: Total uOps:        4000
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    1.33
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    5.28
+# CHECK-NEXT: IPC:               1.32
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     01234
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .  .   st3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=====eeeeeeER.    .    .    .  .   st3	{ v1.b, v2.b, v3.b }[0], [x27], #3
-# CHECK-NEXT: [0,3]     .D===========eER    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D==========eeeeeeER    .    .  .   st3	{ v1.b, v2.b, v3.b }[8], [x27], #3
-# CHECK-NEXT: [0,5]     . D================eER   .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D===============eeeeeeER   .  .   st3	{ v1.b, v2.b, v3.b }[0], [x27], x28
-# CHECK-NEXT: [0,7]     .  D=====================eER  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D====================eeeeeeER.   st3	{ v1.b, v2.b, v3.b }[8], [x27], x28
-# CHECK-NEXT: [0,9]     .   D==========================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeER .   .   st3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE----R .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeER.   .   st3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: [0,3]     .D=eE----R.   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeeeeeER  .   st3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: [0,5]     . D==eE----R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeeeeeER .   st3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .  D==eE----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D==eeeeeeER   st3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: [0,9]     .   D===eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4663,43 +4652,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.16b, v2.16b, v3.16b }, [x27], x28
-# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     6.0    0.0    0.0       st3	{ v1.b, v2.b, v3.b }[0], [x27], #3
-# CHECK-NEXT: 3.     1     12.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     11.0   0.0    0.0       st3	{ v1.b, v2.b, v3.b }[8], [x27], #3
-# CHECK-NEXT: 5.     1     17.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     16.0   0.0    0.0       st3	{ v1.b, v2.b, v3.b }[0], [x27], x28
-# CHECK-NEXT: 7.     1     22.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     21.0   0.0    0.0       st3	{ v1.b, v2.b, v3.b }[8], [x27], x28
-# CHECK-NEXT: 9.     1     27.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     14.0   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st3	{ v1.b, v2.b, v3.b }[0], [x27], #3
+# CHECK-NEXT: 3.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       st3	{ v1.b, v2.b, v3.b }[8], [x27], #3
+# CHECK-NEXT: 5.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       st3	{ v1.b, v2.b, v3.b }[0], [x27], x28
+# CHECK-NEXT: 7.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    1.0    0.0       st3	{ v1.b, v2.b, v3.b }[8], [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.3    0.3    2.0       <total>
 
 # CHECK:      [76] Code Region - G77
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total Cycles:      757
 # CHECK-NEXT: Total uOps:        4000
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    1.33
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    5.28
+# CHECK-NEXT: IPC:               1.32
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     01234
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .  .   st3	{ v1.h, v2.h, v3.h }[0], [x27], #6
-# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=====eeeeeeER.    .    .    .  .   st3	{ v1.h, v2.h, v3.h }[4], [x27], #6
-# CHECK-NEXT: [0,3]     .D===========eER    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D==========eeeeeeER    .    .  .   st3	{ v1.h, v2.h, v3.h }[0], [x27], x28
-# CHECK-NEXT: [0,5]     . D================eER   .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D===============eeeeeeER   .  .   st3	{ v1.h, v2.h, v3.h }[4], [x27], x28
-# CHECK-NEXT: [0,7]     .  D=====================eER  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D====================eeeeeeER.   st3	{ v1.s, v2.s, v3.s }[0], [x27], #12
-# CHECK-NEXT: [0,9]     .   D==========================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeER .   .   st3	{ v1.h, v2.h, v3.h }[0], [x27], #6
+# CHECK-NEXT: [0,1]     D=eE----R .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeER.   .   st3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: [0,3]     .D=eE----R.   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeeeeeER  .   st3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: [0,5]     . D==eE----R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeeeeeER .   st3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: [0,7]     .  D==eE----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D==eeeeeeER   st3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: [0,9]     .   D===eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4709,43 +4698,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.h, v2.h, v3.h }[0], [x27], #6
-# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     6.0    0.0    0.0       st3	{ v1.h, v2.h, v3.h }[4], [x27], #6
-# CHECK-NEXT: 3.     1     12.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     11.0   0.0    0.0       st3	{ v1.h, v2.h, v3.h }[0], [x27], x28
-# CHECK-NEXT: 5.     1     17.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     16.0   0.0    0.0       st3	{ v1.h, v2.h, v3.h }[4], [x27], x28
-# CHECK-NEXT: 7.     1     22.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     21.0   0.0    0.0       st3	{ v1.s, v2.s, v3.s }[0], [x27], #12
-# CHECK-NEXT: 9.     1     27.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     14.0   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st3	{ v1.h, v2.h, v3.h }[4], [x27], #6
+# CHECK-NEXT: 3.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       st3	{ v1.h, v2.h, v3.h }[0], [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       st3	{ v1.h, v2.h, v3.h }[4], [x27], x28
+# CHECK-NEXT: 7.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    1.0    0.0       st3	{ v1.s, v2.s, v3.s }[0], [x27], #12
+# CHECK-NEXT: 9.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.3    0.3    2.0       <total>
 
 # CHECK:      [77] Code Region - G78
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2904
+# CHECK-NEXT: Total Cycles:      807
 # CHECK-NEXT: Total uOps:        4200
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    1.45
-# CHECK-NEXT: IPC:               0.34
+# CHECK-NEXT: uOps Per Cycle:    5.20
+# CHECK-NEXT: IPC:               1.24
 # CHECK-NEXT: Block RThroughput: 8.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     01234
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    .    .    .    . .   st3	{ v1.s, v2.s, v3.s }[0], [x27], x28
-# CHECK-NEXT: [0,1]     D======eER.    .    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=====eeeeeeER.    .    .    . .   st3	{ v1.d, v2.d, v3.d }[0], [x27], #24
-# CHECK-NEXT: [0,3]     .D===========eER    .    .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D==========eeeeeeER    .    . .   st3	{ v1.d, v2.d, v3.d }[0], [x27], x28
-# CHECK-NEXT: [0,5]     . D================eER   .    . .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D===============eeeeeER    . .   st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: [0,7]     .  D====================eER   . .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D===================eeeeeeER.   st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: [0,9]     .   D=========================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeER .   .   st3	{ v1.s, v2.s, v3.s }[0], [x27], x28
+# CHECK-NEXT: [0,1]     D=eE----R .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeER.   .   st3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: [0,3]     .D=eE----R.   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeeeeeER  .   st3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: [0,5]     . D==eE----R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeeeeER  .   st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: [0,7]     .  D==eE---R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D==eeeeeeER   st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: [0,9]     .   D===eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4755,43 +4744,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st3	{ v1.s, v2.s, v3.s }[0], [x27], x28
-# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     6.0    0.0    0.0       st3	{ v1.d, v2.d, v3.d }[0], [x27], #24
-# CHECK-NEXT: 3.     1     12.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     11.0   0.0    0.0       st3	{ v1.d, v2.d, v3.d }[0], [x27], x28
-# CHECK-NEXT: 5.     1     17.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     16.0   0.0    0.0       st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
-# CHECK-NEXT: 7.     1     21.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     20.0   0.0    0.0       st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
-# CHECK-NEXT: 9.     1     26.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     13.7   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st3	{ v1.d, v2.d, v3.d }[0], [x27], #24
+# CHECK-NEXT: 3.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       st3	{ v1.d, v2.d, v3.d }[0], [x27], x28
+# CHECK-NEXT: 5.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], #64
+# CHECK-NEXT: 7.     1     3.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    1.0    0.0       st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], #32
+# CHECK-NEXT: 9.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.3    0.3    1.9       <total>
 
 # CHECK:      [78] Code Region - G79
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3304
+# CHECK-NEXT: Total Cycles:      1205
 # CHECK-NEXT: Total uOps:        5800
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    1.76
-# CHECK-NEXT: IPC:               0.30
+# CHECK-NEXT: uOps Per Cycle:    4.81
+# CHECK-NEXT: IPC:               0.83
 # CHECK-NEXT: Block RThroughput: 12.0
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123456
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     0123456
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .    ..   st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=====eeeeeeeER    .    .    .    ..   st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: [0,3]     . D===========eER   .    .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .  D==========eeeeeeER   .    .    ..   st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: [0,5]     .  D================eER  .    .    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .   D===============eeeeeeeER .    ..   st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: [0,7]     .    D=====================eER.    ..   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .D====================eeeeeeeER.   st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: [0,9]     .    . D==========================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeER .    ..   st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
+# CHECK-NEXT: [0,1]     D=eE----R .    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeeER    ..   st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: [0,3]     . DeE-----R    ..   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .  DeeeeeeER   ..   st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: [0,5]     .  D=eE----R   ..   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .   D==eeeeeeeER.   st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: [0,7]     .    D==eE-----R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .D=eeeeeeeER   st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: [0,9]     .    . D=eE-----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4801,43 +4790,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], #32
-# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     6.0    0.0    0.0       st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
-# CHECK-NEXT: 3.     1     12.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     11.0   0.0    0.0       st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
-# CHECK-NEXT: 5.     1     17.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     16.0   0.0    0.0       st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
-# CHECK-NEXT: 7.     1     22.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     21.0   0.0    0.0       st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
-# CHECK-NEXT: 9.     1     27.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     14.0   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], #64
+# CHECK-NEXT: 3.     1     1.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     1.0    1.0    0.0       st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], #32
+# CHECK-NEXT: 5.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    2.0    0.0       st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], #64
+# CHECK-NEXT: 7.     1     3.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     2.0    0.0    0.0       st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], #64
+# CHECK-NEXT: 9.     1     2.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT:        1     1.8    0.4    2.3       <total>
 
 # CHECK:      [79] Code Region - G80
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total Cycles:      1006
 # CHECK-NEXT: Total uOps:        4800
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    1.60
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    4.77
+# CHECK-NEXT: IPC:               0.99
 # CHECK-NEXT: Block RThroughput: 9.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     012345
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeER  .    .    .    .    .  .   st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: [0,1]     D=====eER .    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D====eeeeeeER .    .    .    .  .   st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: [0,3]     .D==========eER.    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D=========eeeeeeER.    .    .  .   st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: [0,5]     . D===============eER    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D==============eeeeeeeER   .  .   st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: [0,7]     .   D====================eER  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    D===================eeeeeeER.   st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: [0,9]     .    D=========================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeER  .    .   st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
+# CHECK-NEXT: [0,1]     D=eE---R  .    .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeER.    .   st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: [0,3]     .D=eE----R.    .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D==eeeeeeER  .   st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: [0,5]     . D===eE----R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D==eeeeeeeER.   st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: [0,7]     .   D==eE-----R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    D==eeeeeeER   st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: [0,9]     .    D===eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4847,43 +4836,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st4	{ v1.2d, v2.2d, v3.2d, v4.2d }, [x27], x28
-# CHECK-NEXT: 1.     1     6.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     5.0    0.0    0.0       st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
-# CHECK-NEXT: 3.     1     11.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     10.0   0.0    0.0       st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
-# CHECK-NEXT: 5.     1     16.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     15.0   0.0    0.0       st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
-# CHECK-NEXT: 7.     1     21.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     20.0   0.0    0.0       st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
-# CHECK-NEXT: 9.     1     26.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     13.1   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    3.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st4	{ v1.2s, v2.2s, v3.2s, v4.2s }, [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    2.0    0.0       st4	{ v1.4h, v2.4h, v3.4h, v4.4h }, [x27], x28
+# CHECK-NEXT: 5.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     3.0    0.0    0.0       st4	{ v1.4s, v2.4s, v3.4s, v4.4s }, [x27], x28
+# CHECK-NEXT: 7.     1     3.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    1.0    0.0       st4	{ v1.8b, v2.8b, v3.8b, v4.8b }, [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.6    0.4    2.0       <total>
 
 # CHECK:      [80] Code Region - G81
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3204
+# CHECK-NEXT: Total Cycles:      1058
 # CHECK-NEXT: Total uOps:        5200
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    1.62
-# CHECK-NEXT: IPC:               0.31
+# CHECK-NEXT: uOps Per Cycle:    4.91
+# CHECK-NEXT: IPC:               0.95
 # CHECK-NEXT: Block RThroughput: 10.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          012345
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     01234567
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeeER.    .    .    .    .    .   st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: [0,1]     .D======eER    .    .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     . D=====eeeeeeeER   .    .    .    .   st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: [0,3]     .  D===========eER  .    .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     .   D==========eeeeeeER  .    .    .   st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
-# CHECK-NEXT: [0,5]     .   D================eER .    .    .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .    D===============eeeeeeER .    .   st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
-# CHECK-NEXT: [0,7]     .    D=====================eER.    .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .    .D====================eeeeeeER.   st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
-# CHECK-NEXT: [0,9]     .    .D==========================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeeER.    . .   st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
+# CHECK-NEXT: [0,1]     .DeE-----R.    . .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     . DeeeeeeeER   . .   st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: [0,3]     .  DeE-----R   . .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     .   D==eeeeeeER. .   st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: [0,5]     .   D===eE----R. .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .    D===eeeeeeER.   st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: [0,7]     .    D====eE----R.   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .    .D===eeeeeeER   st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: [0,9]     .    .D====eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4893,43 +4882,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st4	{ v1.8h, v2.8h, v3.8h, v4.8h }, [x27], x28
-# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     6.0    0.0    0.0       st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
-# CHECK-NEXT: 3.     1     12.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     11.0   0.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
-# CHECK-NEXT: 5.     1     17.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     16.0   0.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
-# CHECK-NEXT: 7.     1     22.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     21.0   0.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
-# CHECK-NEXT: 9.     1     27.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     14.0   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     1.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    1.0    0.0       st4	{ v1.16b, v2.16b, v3.16b, v4.16b }, [x27], x28
+# CHECK-NEXT: 3.     1     1.0    0.0    5.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     3.0    3.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], #4
+# CHECK-NEXT: 5.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     4.0    1.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], #4
+# CHECK-NEXT: 7.     1     5.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     4.0    0.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[0], [x27], x28
+# CHECK-NEXT: 9.     1     5.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.9    0.6    2.2       <total>
 
 # CHECK:      [81] Code Region - G82
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      3004
+# CHECK-NEXT: Total Cycles:      757
 # CHECK-NEXT: Total uOps:        4000
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    1.33
-# CHECK-NEXT: IPC:               0.33
+# CHECK-NEXT: uOps Per Cycle:    5.28
+# CHECK-NEXT: IPC:               1.32
 # CHECK-NEXT: Block RThroughput: 7.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789          0123
-# CHECK-NEXT: Index     0123456789          0123456789
+# CHECK-NEXT:                     01234
+# CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeeeeeeER .    .    .    .    .  .   st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
-# CHECK-NEXT: [0,1]     D======eER.    .    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=====eeeeeeER.    .    .    .  .   st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
-# CHECK-NEXT: [0,3]     .D===========eER    .    .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D==========eeeeeeER    .    .  .   st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
-# CHECK-NEXT: [0,5]     . D================eER   .    .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D===============eeeeeeER   .  .   st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
-# CHECK-NEXT: [0,7]     .  D=====================eER  .  .   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D====================eeeeeeER.   st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
-# CHECK-NEXT: [0,9]     .   D==========================eER   add	x0, x27, #1
+# CHECK:      [0,0]     DeeeeeeER .   .   st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
+# CHECK-NEXT: [0,1]     D=eE----R .   .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeER.   .   st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: [0,3]     .D=eE----R.   .   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeeeeeER  .   st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: [0,5]     . D==eE----R  .   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeeeeeER .   st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .  D==eE----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D==eeeeeeER   st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: [0,9]     .   D===eE----R   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4939,43 +4928,43 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st4	{ v1.b, v2.b, v3.b, v4.b }[8], [x27], x28
-# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     6.0    0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
-# CHECK-NEXT: 3.     1     12.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     11.0   0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
-# CHECK-NEXT: 5.     1     17.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     16.0   0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
-# CHECK-NEXT: 7.     1     22.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     21.0   0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
-# CHECK-NEXT: 9.     1     27.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     14.0   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], #8
+# CHECK-NEXT: 3.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], #8
+# CHECK-NEXT: 5.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[0], [x27], x28
+# CHECK-NEXT: 7.     1     3.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    1.0    0.0       st4	{ v1.h, v2.h, v3.h, v4.h }[4], [x27], x28
+# CHECK-NEXT: 9.     1     4.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.3    0.3    2.0       <total>
 
 # CHECK:      [82] Code Region - G83
 
 # CHECK:      Iterations:        100
 # CHECK-NEXT: Instructions:      1000
-# CHECK-NEXT: Total Cycles:      2104
+# CHECK-NEXT: Total Cycles:      704
 # CHECK-NEXT: Total uOps:        3600
 
 # CHECK:      Dispatch Width:    10
-# CHECK-NEXT: uOps Per Cycle:    1.71
-# CHECK-NEXT: IPC:               0.48
+# CHECK-NEXT: uOps Per Cycle:    5.11
+# CHECK-NEXT: IPC:               1.42
 # CHECK-NEXT: Block RThroughput: 6.5
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0123456789
-# CHECK-NEXT: Index     0123456789          01234
-
-# CHECK:      [0,0]     DeeeeeeER .    .    .   .   st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
-# CHECK-NEXT: [0,1]     D======eER.    .    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,2]     .D=====eeeeeeER.    .   .   st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
-# CHECK-NEXT: [0,3]     .D===========eER    .   .   add	x0, x27, #1
-# CHECK-NEXT: [0,4]     . D==========eeeeER .   .   st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
-# CHECK-NEXT: [0,5]     . D==============eER.   .   add	x0, x27, #1
-# CHECK-NEXT: [0,6]     .  D=============eeeeER .   st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
-# CHECK-NEXT: [0,7]     .  D=================eER.   add	x0, x27, #1
-# CHECK-NEXT: [0,8]     .   D================eER.   stg	x26, [x27], #4064
-# CHECK-NEXT: [0,9]     .   D=================eER   add	x0, x27, #1
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .   st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
+# CHECK-NEXT: [0,1]     D=eE----R .   add	x0, x27, #1
+# CHECK-NEXT: [0,2]     .DeeeeeeER.   st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: [0,3]     .D=eE----R.   add	x0, x27, #1
+# CHECK-NEXT: [0,4]     . D=eeeeER.   st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: [0,5]     . D==eE--R.   add	x0, x27, #1
+# CHECK-NEXT: [0,6]     .  D=eeeeER   st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: [0,7]     .  D==eE--R   add	x0, x27, #1
+# CHECK-NEXT: [0,8]     .   D==eE-R   stg	x26, [x27], #4064
+# CHECK-NEXT: [0,9]     .   D===eER   add	x0, x27, #1
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -4985,16 +4974,16 @@ add x0, x27, 1
 
 # CHECK:            [0]    [1]    [2]    [3]
 # CHECK-NEXT: 0.     1     1.0    1.0    0.0       st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], #16
-# CHECK-NEXT: 1.     1     7.0    0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 2.     1     6.0    0.0    0.0       st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
-# CHECK-NEXT: 3.     1     12.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 4.     1     11.0   0.0    0.0       st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
-# CHECK-NEXT: 5.     1     15.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 6.     1     14.0   0.0    0.0       st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
-# CHECK-NEXT: 7.     1     18.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT: 8.     1     17.0   0.0    0.0       stg	x26, [x27], #4064
-# CHECK-NEXT: 9.     1     18.0   0.0    0.0       add	x0, x27, #1
-# CHECK-NEXT:        1     11.9   0.1    0.0       <total>
+# CHECK-NEXT: 1.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 2.     1     1.0    0.0    0.0       st4	{ v1.s, v2.s, v3.s, v4.s }[0], [x27], x28
+# CHECK-NEXT: 3.     1     2.0    0.0    4.0       add	x0, x27, #1
+# CHECK-NEXT: 4.     1     2.0    1.0    0.0       st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], #32
+# CHECK-NEXT: 5.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 6.     1     2.0    0.0    0.0       st4	{ v1.d, v2.d, v3.d, v4.d }[0], [x27], x28
+# CHECK-NEXT: 7.     1     3.0    0.0    2.0       add	x0, x27, #1
+# CHECK-NEXT: 8.     1     3.0    1.0    1.0       stg	x26, [x27], #4064
+# CHECK-NEXT: 9.     1     4.0    0.0    0.0       add	x0, x27, #1
+# CHECK-NEXT:        1     2.3    0.3    1.3       <total>
 
 # CHECK:      [83] Code Region - G84
 

>From 269402ba4d33767b7d8830b4ce0a44d7681b258f Mon Sep 17 00:00:00 2001
From: Igor Kudrin <ikudrin at accesssoftek.com>
Date: Thu, 12 Oct 2023 00:24:45 +0700
Subject: [PATCH 26/38] [Support] Add `\g<ref>` backreferences in Regex::sub()
 (#67220)

The existing format of backreferences, `\<ref>`, does not allow digits
to be placed directly after the reference because they are included in
the reference number. The new format solves this problem by adding
explicit delimiters.
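
As a quick illustration of the new syntax (a hypothetical snippet, not part of
this patch; it only assumes the Regex::sub() API declared in the header diff
below):

#include "llvm/Support/Regex.h"
#include <iostream>
#include <string>

int main() {
  std::string Error;
  // "\g<1>" names the first capture group explicitly, so the literal '5'
  // that follows is not swallowed into the reference number.
  std::string Out =
      llvm::Regex("a([0-9]+)b").sub("\\g<1>5", "a1234b", &Error);
  std::cout << Out << "\n"; // prints "12345" when the match succeeds
}

With the old "\<ref>" syntax, "\15" in the replacement string would have been
read as backreference 15.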
---
 llvm/include/llvm/Support/Regex.h    |  5 +++--
 llvm/lib/Support/Regex.cpp           | 19 +++++++++++++++++++
 llvm/unittests/Support/RegexTest.cpp | 28 ++++++++++++++++++++++++++++
 3 files changed, 50 insertions(+), 2 deletions(-)

diff --git a/llvm/include/llvm/Support/Regex.h b/llvm/include/llvm/Support/Regex.h
index ae4b9516f194e3a..bb7a8009b6bd0c3 100644
--- a/llvm/include/llvm/Support/Regex.h
+++ b/llvm/include/llvm/Support/Regex.h
@@ -85,8 +85,9 @@ namespace llvm {
                std::string *Error = nullptr) const;
 
     /// sub - Return the result of replacing the first match of the regex in
-    /// \p String with the \p Repl string. Backreferences like "\0" in the
-    /// replacement string are replaced with the appropriate match substring.
+    /// \p String with the \p Repl string. Backreferences like "\0" and "\g<1>"
+    /// in the replacement string are replaced with the appropriate match
+    /// substring.
     ///
     /// Note that the replacement string has backslash escaping performed on
     /// it. Invalid backreferences are ignored (replaced by empty strings).
diff --git a/llvm/lib/Support/Regex.cpp b/llvm/lib/Support/Regex.cpp
index dfbd373e4a98096..8fa71a749cc8e10 100644
--- a/llvm/lib/Support/Regex.cpp
+++ b/llvm/lib/Support/Regex.cpp
@@ -163,6 +163,25 @@ std::string Regex::sub(StringRef Repl, StringRef String,
 
     // FIXME: We should have a StringExtras function for mapping C99 escapes.
     switch (Repl[0]) {
+
+      // Backreference with the "\g<ref>" syntax
+    case 'g':
+      if (Repl.size() >= 4 && Repl[1] == '<') {
+        size_t End = Repl.find('>');
+        StringRef Ref = Repl.slice(2, End);
+        unsigned RefValue;
+        if (End != StringRef::npos && !Ref.getAsInteger(10, RefValue)) {
+          Repl = Repl.substr(End + 1);
+          if (RefValue < Matches.size())
+            Res += Matches[RefValue];
+          else if (Error && Error->empty())
+            *Error =
+                ("invalid backreference string 'g<" + Twine(Ref) + ">'").str();
+          break;
+        }
+      }
+      [[fallthrough]];
+
       // Treat all unrecognized characters as self-quoting.
     default:
       Res += Repl[0];
diff --git a/llvm/unittests/Support/RegexTest.cpp b/llvm/unittests/Support/RegexTest.cpp
index 78f37cdbd1ef89e..e3c721b466c6ccd 100644
--- a/llvm/unittests/Support/RegexTest.cpp
+++ b/llvm/unittests/Support/RegexTest.cpp
@@ -127,6 +127,34 @@ TEST_F(RegexTest, Substitution) {
 
   EXPECT_EQ("aber", Regex("a[0-9]+b").sub("a\\100b", "a1234ber", &Error));
   EXPECT_EQ(Error, "invalid backreference string '100'");
+
+  EXPECT_EQ("012345", Regex("a([0-9]+).*").sub("0\\g<1>5", "a1234ber", &Error));
+  EXPECT_EQ("", Error);
+
+  EXPECT_EQ("0a1234ber5",
+            Regex("a([0-9]+).*").sub("0\\g<0>5", "a1234ber", &Error));
+  EXPECT_EQ("", Error);
+
+  EXPECT_EQ("0A5", Regex("a(.)(.)(.)(.)(.)(.)(.)(.)(.)(.).*")
+                       .sub("0\\g<10>5", "a123456789Aber", &Error));
+  EXPECT_EQ("", Error);
+
+  EXPECT_EQ("0g<-1>5",
+            Regex("a([0-9]+).*").sub("0\\g<-1>5", "a1234ber", &Error));
+  EXPECT_EQ("", Error);
+
+  EXPECT_EQ("0g<15", Regex("a([0-9]+).*").sub("0\\g<15", "a1234ber", &Error));
+  EXPECT_EQ("", Error);
+
+  EXPECT_EQ("0g<>15", Regex("a([0-9]+).*").sub("0\\g<>15", "a1234ber", &Error));
+  EXPECT_EQ("", Error);
+
+  EXPECT_EQ("0g<3e>1",
+            Regex("a([0-9]+).*").sub("0\\g<3e>1", "a1234ber", &Error));
+  EXPECT_EQ("", Error);
+
+  EXPECT_EQ("aber", Regex("a([0-9]+)b").sub("a\\g<100>b", "a1234ber", &Error));
+  EXPECT_EQ(Error, "invalid backreference string 'g<100>'");
 }
 
 TEST_F(RegexTest, IsLiteralERE) {

>From 1f3755b7dd91b0c4e9643443cbe32ccaea074302 Mon Sep 17 00:00:00 2001
From: Peiming Liu <36770114+PeimingLiu at users.noreply.github.com>
Date: Wed, 11 Oct 2023 10:47:58 -0700
Subject: [PATCH 27/38] [mlir][sparse] fix unused variable warning on release
 build (#68824)
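
For context, a minimal sketch of the pattern being applied (illustrative only,
not taken from the diff): a local variable referenced solely by an assertion
becomes unused when NDEBUG compiles the assert away, so it is additionally
cast to void.

#include <cassert>

int answer() { return 42; }

void check() {
  const int expected = 42; // referenced only by the assertion below
  assert(answer() == expected);
  (void)expected; // silences -Wunused-variable when NDEBUG removes the assert
}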

---
 .../lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp | 2 ++
 .../Dialect/SparseTensor/Transforms/StageSparseOperations.cpp   | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp
index fdecfe303d31351..78f5562b392a682 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorCodegen.cpp
@@ -710,6 +710,8 @@ struct SparseSortCOOConverter : public OpConversionPattern<ConvertOp> {
 
     // Otherwise we need another data shuffle and a non-identity map.
     assert(dstStt.hasSameDimToLvl(srcStt));
+    (void)dstStt; // to silence warning when assertion is disabled
+
     auto id = AffineMap::getMultiDimIdentityMap(srcStt.getLvlRank(), ctx);
 
     rewriter.create<SortOp>(loc, nnz, crd, ValueRange{val}, id,
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/StageSparseOperations.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/StageSparseOperations.cpp
index 60ac71de4dd71ca..4ab4b05a7a420c7 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/StageSparseOperations.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/StageSparseOperations.cpp
@@ -36,6 +36,8 @@ struct StageUnorderedConvert : public OpRewritePattern<ConvertOp> {
     // source -> coo
     // The tmp COO must be unordered, otherwise it is a direct conversion.
     assert(!(srcStt.hasSameDimToLvl(dstStt) && srcStt.isAllOrdered()));
+    (void)srcStt; // to silence warning when assertion is disabled
+
     Type srcCOOTp = getCOOFromTypeWithOrdering(
         dstStt.getRankedTensorType(), dstStt.getDimToLvl(), /*ordered=*/false);
     Value srcCOO = rewriter.create<ConvertOp>(loc, srcCOOTp, op.getSource());

>From 7a5bd045e838e3d881da5f96eb0284fd1fa17b9a Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald at gigawatt.nl>
Date: Wed, 11 Oct 2023 19:20:36 +0100
Subject: [PATCH 28/38] [X86] Use indirect addressing for high 2GB of x32
 address space

Instructions that take immediate addresses sign-extend their operands, so they cannot be used when we actually need zero extension. Use indirect addressing to avoid the problem.
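
As an illustrative sketch (hypothetical C++, not taken from the patch; assumes
clang with -mx32 -O2, and the exact codegen may differ), a load from a fixed
address above 2GB now goes through a register:

#include <cstdint>

int load_high() {
  // 0xEEBEEBEC (-289477652 as a signed 32-bit immediate) lies in the high
  // 2GB of the x32 address space; it is one of the constants used in
  // large-constants-x32.ll below.
  auto *p = reinterpret_cast<volatile std::uint32_t *>(
      static_cast<std::uintptr_t>(0xEEBEEBECu));
  // Old codegen:  movl -289477652, %eax       (displacement sign-extends,
  //                                            i.e. the wrong 64-bit address)
  // New codegen:  movl $-289477652, %eax
  //               movl (%eax), %eax           (32-bit register zero-extends)
  return *p;
}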

The functions in the test are modified versions of the functions of the same names in large-constants.ll, with i64 types changed to i32.

Fixes #55061

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D124406
---
 llvm/lib/Target/X86/X86ISelDAGToDAG.cpp      | 20 +++++++++++-
 llvm/test/CodeGen/X86/large-constants-x32.ll | 32 ++++++++++++--------
 2 files changed, 39 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 694a2df195c0922..93e184eca9bc515 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -1699,10 +1699,28 @@ bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
     if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
         !isDispSafeForFrameIndex(Val))
       return true;
+    // In ILP32 (x32) mode, pointers are 32 bits and need to be zero-extended to
+    // 64 bits. Instructions with 32-bit register addresses perform this zero
+    // extension for us and we can safely ignore the high bits of Offset.
+    // Instructions with only a 32-bit immediate address do not, though: they
+    // sign extend instead. This means only the low 2GB of address space is
+    // directly addressable; we need indirect addressing for the high 2GB of
+    // address space.
+    // TODO: Some of the earlier checks may be relaxed for ILP32 mode as the
+    // implicit zero extension of instructions would cover up any problem.
+    // However, we have asserts elsewhere that get triggered if we do, so keep
+    // the checks for now.
+    // TODO: We would actually be able to accept these, as well as the same
+    // addresses in LP64 mode, by adding the EIZ pseudo-register as an operand
+    // to get an address size override to be emitted. However, this
+    // pseudo-register is not part of any register class and therefore causes
+    // MIR verification to fail.
+    if (Subtarget->isTarget64BitILP32() && !isUInt<31>(Val) &&
+        !AM.hasBaseOrIndexReg())
+      return true;
   }
   AM.Disp = Val;
   return false;
-
 }
 
 bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM,
diff --git a/llvm/test/CodeGen/X86/large-constants-x32.ll b/llvm/test/CodeGen/X86/large-constants-x32.ll
index e875920b8b7054e..95a75006e085ada 100644
--- a/llvm/test/CodeGen/X86/large-constants-x32.ll
+++ b/llvm/test/CodeGen/X86/large-constants-x32.ll
@@ -4,12 +4,16 @@
 define void @constant_expressions() {
 ; CHECK-LABEL: constant_expressions:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movl -289477652, %eax
-; CHECK-NEXT:    movl -289477644, %ecx
-; CHECK-NEXT:    addl -289477648, %eax
-; CHECK-NEXT:    addl -289477636, %ecx
-; CHECK-NEXT:    addl %eax, %ecx
-; CHECK-NEXT:    movl %ecx, -289477652
+; CHECK-NEXT:    movl $-289477652, %eax # imm = 0xEEBEEBEC
+; CHECK-NEXT:    movl (%eax), %ecx
+; CHECK-NEXT:    movl $-289477644, %edx # imm = 0xEEBEEBF4
+; CHECK-NEXT:    movl (%edx), %edx
+; CHECK-NEXT:    movl $-289477648, %esi # imm = 0xEEBEEBF0
+; CHECK-NEXT:    addl (%esi), %ecx
+; CHECK-NEXT:    movl $-289477636, %esi # imm = 0xEEBEEBFC
+; CHECK-NEXT:    addl (%esi), %edx
+; CHECK-NEXT:    addl %ecx, %edx
+; CHECK-NEXT:    movl %edx, (%eax)
 ; CHECK-NEXT:    retq
 entry:
   %0 = load i32, i32* inttoptr (i32 add (i32 -289477652, i32 0) to i32*)
@@ -27,12 +31,16 @@ entry:
 define void @constant_expressions2() {
 ; CHECK-LABEL: constant_expressions2:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movl -289477652, %eax
-; CHECK-NEXT:    movl -289477644, %ecx
-; CHECK-NEXT:    addl -289477648, %eax
-; CHECK-NEXT:    addl -289477640, %ecx
-; CHECK-NEXT:    addl %eax, %ecx
-; CHECK-NEXT:    movl %ecx, -289477652
+; CHECK-NEXT:    movl $-289477652, %eax # imm = 0xEEBEEBEC
+; CHECK-NEXT:    movl (%eax), %ecx
+; CHECK-NEXT:    movl $-289477644, %edx # imm = 0xEEBEEBF4
+; CHECK-NEXT:    movl (%edx), %edx
+; CHECK-NEXT:    movl $-289477648, %esi # imm = 0xEEBEEBF0
+; CHECK-NEXT:    addl (%esi), %ecx
+; CHECK-NEXT:    movl $-289477640, %esi # imm = 0xEEBEEBF8
+; CHECK-NEXT:    addl (%esi), %edx
+; CHECK-NEXT:    addl %ecx, %edx
+; CHECK-NEXT:    movl %edx, (%eax)
 ; CHECK-NEXT:    retq
 entry:
   %0 = load i32, i32* inttoptr (i32 -289477652 to i32*)

>From e8a9e94b948ed8e473e95348e46326082d14f7eb Mon Sep 17 00:00:00 2001
From: Fangrui Song <i at maskray.me>
Date: Wed, 11 Oct 2023 11:22:13 -0700
Subject: [PATCH 29/38] [llvm-cov gcov] Make recursive propagateCounts
 iterative. NFC

propagateCounts computes unmeasured arc counts (see
commit b9d086693b5baebc477793af0d86a447bae01b6f).

In an x86-64 build using -O3 -fno-omit-frame-pointer, propagateCounts uses 80
bytes per stack frame. If a function contains 1e5 basic blocks on a tree path
(Kirchhoff's circuit law optimization), the used stack space will be 8MB (the
default ulimit -s in many configurations). (In a -O0 build, a stack frame costs
224 bytes.) 1e5 frames is ample headroom for most configurations. However, for
library users using threads (e.g. in RPC handlers), a remaining thread stack of
64KiB allows just 819 stack frames, which is too limited.

Switch to an iterative form to avoid stack overflow issues. This matches the
other functions in this file that were already converted to iterative forms
(https://reviews.llvm.org/D93073).

Alternative to #68455
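
A simplified sketch of the recursion-to-iteration pattern (illustrative only;
the real code below additionally tracks the predecessor arc, a visited set,
and the sign of the excess):

#include <cstddef>
#include <cstdint>
#include <vector>

struct Node {
  std::vector<Node *> kids;
  std::uint64_t count = 0;
};

// Post-order sum over a tree without recursion: each stack slot keeps the
// child index `i`, so it plays the role of one recursive frame.
std::uint64_t sumCounts(Node &root) {
  struct Elem {
    Node *n;
    std::size_t i = 0;
    std::uint64_t acc = 0;
  };
  std::vector<Elem> stack;
  stack.push_back({&root});
  std::uint64_t result = 0;
  for (;;) {
    Elem &top = stack.back();
    if (top.i < top.n->kids.size()) {
      stack.push_back({top.n->kids[top.i++]}); // descend into the next child
    } else {
      std::uint64_t acc = top.acc + top.n->count; // post-order work
      stack.pop_back();
      if (stack.empty()) {
        result = acc;
        break;
      }
      stack.back().acc += acc; // propagate the partial sum to the parent
    }
  }
  return result;
}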
---
 llvm/include/llvm/ProfileData/GCOV.h |  2 +-
 llvm/lib/ProfileData/GCOV.cpp        | 73 ++++++++++++++++++++--------
 2 files changed, 55 insertions(+), 20 deletions(-)

diff --git a/llvm/include/llvm/ProfileData/GCOV.h b/llvm/include/llvm/ProfileData/GCOV.h
index 674260c81fa6a24..c8af71dbf61ef1f 100644
--- a/llvm/include/llvm/ProfileData/GCOV.h
+++ b/llvm/include/llvm/ProfileData/GCOV.h
@@ -249,7 +249,7 @@ class GCOVFunction {
     return make_range(blocks.begin(), blocks.end());
   }
 
-  uint64_t propagateCounts(const GCOVBlock &v, GCOVArc *pred);
+  void propagateCounts(const GCOVBlock &v, GCOVArc *pred);
   void print(raw_ostream &OS) const;
   void dump() const;
 
diff --git a/llvm/lib/ProfileData/GCOV.cpp b/llvm/lib/ProfileData/GCOV.cpp
index 0a3330fde1d100f..5fce3dd5f7b7007 100644
--- a/llvm/lib/ProfileData/GCOV.cpp
+++ b/llvm/lib/ProfileData/GCOV.cpp
@@ -365,25 +365,60 @@ GCOVBlock &GCOVFunction::getExitBlock() const {
 // For each basic block, the sum of incoming edge counts equals the sum of
 // outgoing edge counts by Kirchoff's circuit law. If the unmeasured arcs form a
 // spanning tree, the count for each unmeasured arc (GCOV_ARC_ON_TREE) can be
-// uniquely identified.
-uint64_t GCOVFunction::propagateCounts(const GCOVBlock &v, GCOVArc *pred) {
-  // If GCOV_ARC_ON_TREE edges do form a tree, visited is not needed; otherwise
-  // this prevents infinite recursion.
-  if (!visited.insert(&v).second)
-    return 0;
-
-  uint64_t excess = 0;
-  for (GCOVArc *e : v.srcs())
-    if (e != pred)
-      excess += e->onTree() ? propagateCounts(e->src, e) : e->count;
-  for (GCOVArc *e : v.dsts())
-    if (e != pred)
-      excess -= e->onTree() ? propagateCounts(e->dst, e) : e->count;
-  if (int64_t(excess) < 0)
-    excess = -excess;
-  if (pred)
-    pred->count = excess;
-  return excess;
+// uniquely identified. Use an iterative algorithm to decrease stack usage for
+// library users in threads. See the edge propagation algorithm in Optimally
+// Profiling and Tracing Programs, ACM Transactions on Programming Languages and
+// Systems, 1994.
+void GCOVFunction::propagateCounts(const GCOVBlock &v, GCOVArc *pred) {
+  struct Elem {
+    const GCOVBlock &v;
+    GCOVArc *pred;
+    bool inDst;
+    size_t i = 0;
+    uint64_t excess = 0;
+  };
+
+  SmallVector<Elem, 0> stack;
+  stack.push_back({v, pred, false});
+  for (;;) {
+    Elem &u = stack.back();
+    // If GCOV_ARC_ON_TREE edges do form a tree, visited is not needed;
+    // otherwise, this prevents infinite recursion for bad input.
+    if (u.i == 0 && !visited.insert(&u.v).second) {
+      stack.pop_back();
+      if (stack.empty())
+        break;
+      continue;
+    }
+    if (u.i < u.v.pred.size()) {
+      GCOVArc *e = u.v.pred[u.i++];
+      if (e != u.pred) {
+        if (e->onTree())
+          stack.push_back({e->src, e, /*inDst=*/false});
+        else
+          u.excess += e->count;
+      }
+    } else if (u.i < u.v.pred.size() + u.v.succ.size()) {
+      GCOVArc *e = u.v.succ[u.i++ - u.v.pred.size()];
+      if (e != u.pred) {
+        if (e->onTree())
+          stack.push_back({e->dst, e, /*inDst=*/true});
+        else
+          u.excess -= e->count;
+      }
+    } else {
+      uint64_t excess = u.excess;
+      if (static_cast<int64_t>(excess) < 0)
+        excess = -excess;
+      if (u.pred)
+        u.pred->count = excess;
+      bool inDst = u.inDst;
+      stack.pop_back();
+      if (stack.empty())
+        break;
+      stack.back().excess += inDst ? -excess : excess;
+    }
+  }
 }
 
 void GCOVFunction::print(raw_ostream &OS) const {

>From 67c16b795b2e12f9bf1868b8925d4fcf0c7b2087 Mon Sep 17 00:00:00 2001
From: Peiming Liu <36770114+PeimingLiu at users.noreply.github.com>
Date: Wed, 11 Oct 2023 11:23:25 -0700
Subject: [PATCH 30/38] [mlir][sparse] remove tests (#68826)

---
 .../SparseTensor/codegen_sparse_dealloc.mlir  | 29 -------------------
 1 file changed, 29 deletions(-)
 delete mode 100644 mlir/test/Dialect/SparseTensor/codegen_sparse_dealloc.mlir

diff --git a/mlir/test/Dialect/SparseTensor/codegen_sparse_dealloc.mlir b/mlir/test/Dialect/SparseTensor/codegen_sparse_dealloc.mlir
deleted file mode 100644
index 49994a33c1911c6..000000000000000
--- a/mlir/test/Dialect/SparseTensor/codegen_sparse_dealloc.mlir
+++ /dev/null
@@ -1,29 +0,0 @@
-// UNSUPPORTED: target={{.*}}
-// TODO: the test is temporarily disabled (we probably do not need the option anymore by switch to buffer deallcation pass)
-//
-// RUN: mlir-opt %s --post-sparsification-rewrite="enable-runtime-library=false" \
-// RUN:    --sparse-tensor-codegen=create-sparse-deallocs=false \
-// RUN:    --canonicalize --cse | FileCheck %s -check-prefix=CHECK-NO-DEALLOC
-
-// RUN: mlir-opt %s --post-sparsification-rewrite="enable-runtime-library=false" \
-// RUN:    --sparse-tensor-codegen=create-sparse-deallocs=true \
-// RUN:    --canonicalize --cse | FileCheck %s -check-prefix=CHECK-DEALLOC
-
-#CSR = #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : dense, d1 : compressed)}>
-#CSC = #sparse_tensor.encoding<{
-  map = (d0, d1) -> (d1 : dense, d0 : compressed),
-}>
-
-//
-// No memref.dealloc is user-requested so
-// CHECK-NO-DEALLOC-LABEL: @sparse_convert_permuted
-// CHECK-NO-DEALLOC-NOT: memref.dealloc
-//
-// Otherwise memref.dealloc is created to free temporary sparse buffers.
-// CHECK-DEALLOC-LABEL: @sparse_convert_permuted
-// CHECK-DEALLOC: memref.dealloc
-//
-func.func @sparse_convert_permuted(%arg0: tensor<?x?xf32, #CSR>) -> tensor<?x?xf32, #CSC> {
-  %0 = sparse_tensor.convert %arg0 : tensor<?x?xf32, #CSR> to tensor<?x?xf32, #CSC>
-  return %0 : tensor<?x?xf32, #CSC>
-}

>From 51f07655f4679a7a1a0cebed0fc5b1b55181b8cb Mon Sep 17 00:00:00 2001
From: Alexander Richardson <alexrichardson at google.com>
Date: Wed, 11 Oct 2023 11:35:11 -0700
Subject: [PATCH 31/38] [libunwind] Fix wrong end argument passed to
 decodeEHHdr() (#68813)

All but one callsite was actually passing start+length arguments.
This should not have any functional change since the end argument is
almost always ignored.
I noticed this while debugging some incorrect error messages being
printed while running the testsuite on baremetal (using binaries that did
not have a valid eh_frame_hdr section): the tests print
`libunwind: unsupported .eh_frame_hdr version: 20 at
https://github.com/arichardson/upstream-llvm-project/commit/8000d308146ebf49cb364cb600e28a0a42e22c83`
because libunwind is reading nonsense data for .eh_frame_hdr.
---
 libunwind/src/AddressSpace.hpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/libunwind/src/AddressSpace.hpp b/libunwind/src/AddressSpace.hpp
index 1abbc822546878d..5551c7d4bef1c56 100644
--- a/libunwind/src/AddressSpace.hpp
+++ b/libunwind/src/AddressSpace.hpp
@@ -414,8 +414,8 @@ static bool checkForUnwindInfoSegment(const Elf_Phdr *phdr, size_t image_base,
     cbdata->sects->dwarf_index_section = eh_frame_hdr_start;
     cbdata->sects->dwarf_index_section_length = phdr->p_memsz;
     if (EHHeaderParser<LocalAddressSpace>::decodeEHHdr(
-            *cbdata->addressSpace, eh_frame_hdr_start, phdr->p_memsz,
-            hdrInfo)) {
+            *cbdata->addressSpace, eh_frame_hdr_start,
+            eh_frame_hdr_start + phdr->p_memsz, hdrInfo)) {
       // .eh_frame_hdr records the start of .eh_frame, but not its size.
       // Rely on a zero terminator to find the end of the section.
       cbdata->sects->dwarf_section = hdrInfo.eh_frame_ptr;
@@ -638,7 +638,8 @@ inline bool LocalAddressSpace::findUnwindSections(pint_t targetAddr,
     info.dwarf_index_section_length = SIZE_MAX;
     EHHeaderParser<LocalAddressSpace>::EHHeaderInfo hdrInfo;
     if (!EHHeaderParser<LocalAddressSpace>::decodeEHHdr(
-            *this, info.dwarf_index_section, info.dwarf_index_section_length,
+            *this, info.dwarf_index_section,
+            info.dwarf_index_section + info.dwarf_index_section_length,
             hdrInfo)) {
       return false;
     }

>From 7430d6e496a15f10ff75693211809959c7482400 Mon Sep 17 00:00:00 2001
From: Alexander Richardson <alexrichardson at google.com>
Date: Wed, 11 Oct 2023 11:46:09 -0700
Subject: [PATCH 32/38] [libunwind] Avoid reading OOB for non-existent
 .eh_frame_hdr (#68815)

I was running the tests with baremetal picolibc, whose linker script
sets __eh_frame_start==__eh_frame_end (not equal to zero) when there
is no .eh_frame_hdr.
I noticed that libunwind was trying to read nonsense data because it
was printing messages such as
`libunwind: unsupported .eh_frame_hdr version: 20 at
https://github.com/llvm/llvm-project/commit/8000d308146ebf49cb364cb600e28a0a42e22c83`

This change adds an ehHdr size check to avoid reading this out-of-bounds
data and potentially crashing.
---
 libunwind/src/EHHeaderParser.hpp | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/libunwind/src/EHHeaderParser.hpp b/libunwind/src/EHHeaderParser.hpp
index ed4317c89055c9e..0662a1321e2c75e 100644
--- a/libunwind/src/EHHeaderParser.hpp
+++ b/libunwind/src/EHHeaderParser.hpp
@@ -55,6 +55,19 @@ template <typename A>
 bool EHHeaderParser<A>::decodeEHHdr(A &addressSpace, pint_t ehHdrStart,
                                     pint_t ehHdrEnd, EHHeaderInfo &ehHdrInfo) {
   pint_t p = ehHdrStart;
+
+  // Ensure that we don't read data beyond the end of .eh_frame_hdr
+  if (ehHdrEnd - ehHdrStart < 4) {
+    // Don't print a message for an empty .eh_frame_hdr (this can happen if
+    // the linker script defines symbols for it even in the empty case).
+    if (ehHdrEnd == ehHdrStart)
+      return false;
+    _LIBUNWIND_LOG("unsupported .eh_frame_hdr at %" PRIx64
+                   ": need at least 4 bytes of data but only got %zd",
+                   static_cast<uint64_t>(ehHdrStart),
+                   static_cast<size_t>(ehHdrEnd - ehHdrStart));
+    return false;
+  }
   uint8_t version = addressSpace.get8(p++);
   if (version != 1) {
     _LIBUNWIND_LOG("unsupported .eh_frame_hdr version: %" PRIu8 " at %" PRIx64,

>From df172894d7f99ba3105c063a573ffd8b22feb6f0 Mon Sep 17 00:00:00 2001
From: Stanislav Gatev <sgatev at google.com>
Date: Wed, 11 Oct 2023 22:18:46 +0200
Subject: [PATCH 33/38] [clang][dataflow] Add support for lambda captures
 (#68558)

This adds support for copy, ref, and this lambda captures to the core
framework and also adds relevant tests in UncheckedOptionalAccessTest.
---
 .../FlowSensitive/DataflowAnalysisContext.h   |   2 +-
 .../FlowSensitive/DataflowEnvironment.h       |   6 +-
 .../FlowSensitive/DataflowAnalysisContext.cpp |   2 +-
 .../FlowSensitive/DataflowEnvironment.cpp     |  24 +-
 .../Analysis/FlowSensitive/TestingSupport.cpp |  15 +-
 .../Analysis/FlowSensitive/TestingSupport.h   |  16 +-
 .../Analysis/FlowSensitive/TransferTest.cpp   | 235 ++++++++++++++++++
 .../UncheckedOptionalAccessModelTest.cpp      | 137 ++++++++++
 8 files changed, 422 insertions(+), 15 deletions(-)

diff --git a/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h b/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h
index 4698f0616e66e82..c46109a02921e7f 100644
--- a/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h
+++ b/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h
@@ -98,7 +98,7 @@ class DataflowAnalysisContext {
   StorageLocation &createStorageLocation(QualType Type);
 
   /// Returns a stable storage location for `D`.
-  StorageLocation &getStableStorageLocation(const VarDecl &D);
+  StorageLocation &getStableStorageLocation(const ValueDecl &D);
 
   /// Returns a stable storage location for `E`.
   StorageLocation &getStableStorageLocation(const Expr &E);
diff --git a/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h b/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h
index 73f747ff88cf447..56d647f35b08430 100644
--- a/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h
+++ b/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h
@@ -242,7 +242,7 @@ class Environment {
   /// Creates a storage location for `D`. Does not assign the returned storage
   /// location to `D` in the environment. Does not assign a value to the
   /// returned storage location in the environment.
-  StorageLocation &createStorageLocation(const VarDecl &D);
+  StorageLocation &createStorageLocation(const ValueDecl &D);
 
   /// Creates a storage location for `E`. Does not assign the returned storage
   /// location to `E` in the environment. Does not assign a value to the
@@ -408,7 +408,7 @@ class Environment {
   /// this value. Otherwise, initializes the object with a value created using
   /// `createValue()`.  Uses the storage location returned by
   /// `DataflowAnalysisContext::getStableStorageLocation(D)`.
-  StorageLocation &createObject(const VarDecl &D, const Expr *InitExpr) {
+  StorageLocation &createObject(const ValueDecl &D, const Expr *InitExpr) {
     return createObjectInternal(&D, D.getType(), InitExpr);
   }
 
@@ -614,7 +614,7 @@ class Environment {
 
   /// Shared implementation of `createObject()` overloads.
   /// `D` and `InitExpr` may be null.
-  StorageLocation &createObjectInternal(const VarDecl *D, QualType Ty,
+  StorageLocation &createObjectInternal(const ValueDecl *D, QualType Ty,
                                         const Expr *InitExpr);
 
   /// Shared implementation of `pushCall` overloads. Note that unlike
diff --git a/clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp b/clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp
index e81048ce9233808..fa9b40fc49b3ae7 100644
--- a/clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp
+++ b/clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp
@@ -73,7 +73,7 @@ StorageLocation &DataflowAnalysisContext::createStorageLocation(QualType Type) {
 }
 
 StorageLocation &
-DataflowAnalysisContext::getStableStorageLocation(const VarDecl &D) {
+DataflowAnalysisContext::getStableStorageLocation(const ValueDecl &D) {
   if (auto *Loc = DeclToLoc.lookup(&D))
     return *Loc;
   auto &Loc = createStorageLocation(D.getType().getNonReferenceType());
diff --git a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
index 66d98c995468595..01c6cc69e2b9fac 100644
--- a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
+++ b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
@@ -448,11 +448,23 @@ Environment::Environment(DataflowAnalysisContext &DACtx,
   if (const auto *MethodDecl = dyn_cast<CXXMethodDecl>(&DeclCtx)) {
     auto *Parent = MethodDecl->getParent();
     assert(Parent != nullptr);
-    if (Parent->isLambda())
-      MethodDecl = dyn_cast<CXXMethodDecl>(Parent->getDeclContext());
 
-    // FIXME: Initialize the ThisPointeeLoc of lambdas too.
-    if (MethodDecl && MethodDecl->isImplicitObjectMemberFunction()) {
+    if (Parent->isLambda()) {
+      for (auto Capture : Parent->captures()) {
+        if (Capture.capturesVariable()) {
+          const auto *VarDecl = Capture.getCapturedVar();
+          assert(VarDecl != nullptr);
+          setStorageLocation(*VarDecl, createObject(*VarDecl, nullptr));
+        } else if (Capture.capturesThis()) {
+          const auto *SurroundingMethodDecl =
+              cast<CXXMethodDecl>(DeclCtx.getNonClosureAncestor());
+          QualType ThisPointeeType =
+              SurroundingMethodDecl->getFunctionObjectParameterType();
+          ThisPointeeLoc =
+              &cast<RecordValue>(createValue(ThisPointeeType))->getLoc();
+        }
+      }
+    } else if (MethodDecl->isImplicitObjectMemberFunction()) {
       QualType ThisPointeeType = MethodDecl->getFunctionObjectParameterType();
       ThisPointeeLoc =
           &cast<RecordValue>(createValue(ThisPointeeType))->getLoc();
@@ -673,7 +685,7 @@ StorageLocation &Environment::createStorageLocation(QualType Type) {
   return DACtx->createStorageLocation(Type);
 }
 
-StorageLocation &Environment::createStorageLocation(const VarDecl &D) {
+StorageLocation &Environment::createStorageLocation(const ValueDecl &D) {
   // Evaluated declarations are always assigned the same storage locations to
   // ensure that the environment stabilizes across loop iterations. Storage
   // locations for evaluated declarations are stored in the analysis context.
@@ -885,7 +897,7 @@ Environment::createLocAndMaybeValue(QualType Ty,
   return Loc;
 }
 
-StorageLocation &Environment::createObjectInternal(const VarDecl *D,
+StorageLocation &Environment::createObjectInternal(const ValueDecl *D,
                                                    QualType Ty,
                                                    const Expr *InitExpr) {
   if (Ty->isReferenceType()) {
diff --git a/clang/unittests/Analysis/FlowSensitive/TestingSupport.cpp b/clang/unittests/Analysis/FlowSensitive/TestingSupport.cpp
index 72bdfee26fe7f3e..65c527ae63d2d71 100644
--- a/clang/unittests/Analysis/FlowSensitive/TestingSupport.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/TestingSupport.cpp
@@ -161,7 +161,18 @@ llvm::Error test::checkDataflowWithNoopAnalysis(
         VerifyResults,
     DataflowAnalysisOptions Options, LangStandard::Kind Std,
     llvm::StringRef TargetFun) {
-  using ast_matchers::hasName;
+  return checkDataflowWithNoopAnalysis(Code, ast_matchers::hasName(TargetFun),
+                                       VerifyResults, Options, Std);
+}
+
+llvm::Error test::checkDataflowWithNoopAnalysis(
+    llvm::StringRef Code,
+    ast_matchers::internal::Matcher<FunctionDecl> TargetFuncMatcher,
+    std::function<
+        void(const llvm::StringMap<DataflowAnalysisState<NoopLattice>> &,
+             ASTContext &)>
+        VerifyResults,
+    DataflowAnalysisOptions Options, LangStandard::Kind Std) {
   llvm::SmallVector<std::string, 3> ASTBuildArgs = {
       // -fnodelayed-template-parsing is the default everywhere but on Windows.
       // Set it explicitly so that tests behave the same on Windows as on other
@@ -170,7 +181,7 @@ llvm::Error test::checkDataflowWithNoopAnalysis(
       "-std=" +
           std::string(LangStandard::getLangStandardForKind(Std).getName())};
   AnalysisInputs<NoopAnalysis> AI(
-      Code, hasName(TargetFun),
+      Code, TargetFuncMatcher,
       [UseBuiltinModel = Options.BuiltinOpts.has_value()](ASTContext &C,
                                                           Environment &Env) {
         return NoopAnalysis(
diff --git a/clang/unittests/Analysis/FlowSensitive/TestingSupport.h b/clang/unittests/Analysis/FlowSensitive/TestingSupport.h
index 44d962d5da9a97b..de3046e22897c3e 100644
--- a/clang/unittests/Analysis/FlowSensitive/TestingSupport.h
+++ b/clang/unittests/Analysis/FlowSensitive/TestingSupport.h
@@ -398,8 +398,8 @@ checkDataflow(AnalysisInputs<AnalysisT> AI,
 
 using BuiltinOptions = DataflowAnalysisContext::Options;
 
-/// Runs dataflow on `Code` with a `NoopAnalysis` and calls `VerifyResults` to
-/// verify the results.
+/// Runs dataflow on function named `TargetFun` in `Code` with a `NoopAnalysis`
+/// and calls `VerifyResults` to verify the results.
 llvm::Error checkDataflowWithNoopAnalysis(
     llvm::StringRef Code,
     std::function<
@@ -410,6 +410,18 @@ llvm::Error checkDataflowWithNoopAnalysis(
     LangStandard::Kind Std = LangStandard::lang_cxx17,
     llvm::StringRef TargetFun = "target");
 
+/// Runs dataflow on function matched by `TargetFuncMatcher` in `Code` with a
+/// `NoopAnalysis` and calls `VerifyResults` to verify the results.
+llvm::Error checkDataflowWithNoopAnalysis(
+    llvm::StringRef Code,
+    ast_matchers::internal::Matcher<FunctionDecl> TargetFuncMatcher,
+    std::function<
+        void(const llvm::StringMap<DataflowAnalysisState<NoopLattice>> &,
+             ASTContext &)>
+        VerifyResults = [](const auto &, auto &) {},
+    DataflowAnalysisOptions Options = {BuiltinOptions()},
+    LangStandard::Kind Std = LangStandard::lang_cxx17);
+
 /// Returns the `ValueDecl` for the given identifier.
 ///
 /// Requirements:
diff --git a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp
index 2e8d38490fdb864..632632a1b30e78b 100644
--- a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp
@@ -66,6 +66,37 @@ void runDataflow(
               Std, TargetFun);
 }
 
+void runDataflowOnLambda(
+    llvm::StringRef Code,
+    std::function<
+        void(const llvm::StringMap<DataflowAnalysisState<NoopLattice>> &,
+             ASTContext &)>
+        VerifyResults,
+    DataflowAnalysisOptions Options,
+    LangStandard::Kind Std = LangStandard::lang_cxx17) {
+  ASSERT_THAT_ERROR(
+      checkDataflowWithNoopAnalysis(
+          Code,
+          ast_matchers::hasDeclContext(
+              ast_matchers::cxxRecordDecl(ast_matchers::isLambda())),
+          VerifyResults, Options, Std),
+      llvm::Succeeded());
+}
+
+void runDataflowOnLambda(
+    llvm::StringRef Code,
+    std::function<
+        void(const llvm::StringMap<DataflowAnalysisState<NoopLattice>> &,
+             ASTContext &)>
+        VerifyResults,
+    LangStandard::Kind Std = LangStandard::lang_cxx17,
+    bool ApplyBuiltinTransfer = true) {
+  runDataflowOnLambda(Code, std::move(VerifyResults),
+                      {ApplyBuiltinTransfer ? BuiltinOptions{}
+                                            : std::optional<BuiltinOptions>()},
+                      Std);
+}
+
 const Formula &getFormula(const ValueDecl &D, const Environment &Env) {
   return cast<BoolValue>(Env.getValue(D))->formula();
 }
@@ -5987,4 +6018,208 @@ TEST(TransferTest, EvaluateBlockWithUnreachablePreds) {
          ASTContext &ASTCtx) {});
 }
 
+TEST(TransferTest, LambdaCaptureByCopy) {
+  std::string Code = R"(
+    void target(int Foo, int Bar) {
+      [Foo]() {
+        (void)0;
+        // [[p]]
+      }();
+    }
+  )";
+  runDataflowOnLambda(
+      Code,
+      [](const llvm::StringMap<DataflowAnalysisState<NoopLattice>> &Results,
+         ASTContext &ASTCtx) {
+        ASSERT_THAT(Results.keys(), UnorderedElementsAre("p"));
+        const Environment &Env = getEnvironmentAtAnnotation(Results, "p");
+
+        const ValueDecl *FooDecl = findValueDecl(ASTCtx, "Foo");
+        ASSERT_THAT(FooDecl, NotNull());
+
+        const StorageLocation *FooLoc = Env.getStorageLocation(*FooDecl);
+        ASSERT_TRUE(isa_and_nonnull<ScalarStorageLocation>(FooLoc));
+
+        const Value *FooVal = Env.getValue(*FooLoc);
+        EXPECT_TRUE(isa_and_nonnull<IntegerValue>(FooVal));
+
+        const ValueDecl *BarDecl = findValueDecl(ASTCtx, "Bar");
+        ASSERT_THAT(BarDecl, NotNull());
+
+        const StorageLocation *BarLoc = Env.getStorageLocation(*BarDecl);
+        EXPECT_THAT(BarLoc, IsNull());
+      });
+}
+
+TEST(TransferTest, LambdaCaptureByReference) {
+  std::string Code = R"(
+    void target(int Foo, int Bar) {
+      [&Foo]() {
+        (void)0;
+        // [[p]]
+      }();
+    }
+  )";
+  runDataflowOnLambda(
+      Code,
+      [](const llvm::StringMap<DataflowAnalysisState<NoopLattice>> &Results,
+         ASTContext &ASTCtx) {
+        ASSERT_THAT(Results.keys(), UnorderedElementsAre("p"));
+        const Environment &Env = getEnvironmentAtAnnotation(Results, "p");
+
+        const ValueDecl *FooDecl = findValueDecl(ASTCtx, "Foo");
+        ASSERT_THAT(FooDecl, NotNull());
+
+        const StorageLocation *FooLoc = Env.getStorageLocation(*FooDecl);
+        ASSERT_TRUE(isa_and_nonnull<ScalarStorageLocation>(FooLoc));
+
+        const Value *FooVal = Env.getValue(*FooLoc);
+        EXPECT_TRUE(isa_and_nonnull<IntegerValue>(FooVal));
+
+        const ValueDecl *BarDecl = findValueDecl(ASTCtx, "Bar");
+        ASSERT_THAT(BarDecl, NotNull());
+
+        const StorageLocation *BarLoc = Env.getStorageLocation(*BarDecl);
+        EXPECT_THAT(BarLoc, IsNull());
+      });
+}
+
+TEST(TransferTest, LambdaCaptureWithInitializer) {
+  std::string Code = R"(
+    void target(int Bar) {
+      [Foo=Bar]() {
+        (void)0;
+        // [[p]]
+      }();
+    }
+  )";
+  runDataflowOnLambda(
+      Code,
+      [](const llvm::StringMap<DataflowAnalysisState<NoopLattice>> &Results,
+         ASTContext &ASTCtx) {
+        ASSERT_THAT(Results.keys(), UnorderedElementsAre("p"));
+        const Environment &Env = getEnvironmentAtAnnotation(Results, "p");
+
+        const ValueDecl *FooDecl = findValueDecl(ASTCtx, "Foo");
+        ASSERT_THAT(FooDecl, NotNull());
+
+        const StorageLocation *FooLoc = Env.getStorageLocation(*FooDecl);
+        ASSERT_TRUE(isa_and_nonnull<ScalarStorageLocation>(FooLoc));
+
+        const Value *FooVal = Env.getValue(*FooLoc);
+        EXPECT_TRUE(isa_and_nonnull<IntegerValue>(FooVal));
+
+        const ValueDecl *BarDecl = findValueDecl(ASTCtx, "Bar");
+        ASSERT_THAT(BarDecl, NotNull());
+
+        const StorageLocation *BarLoc = Env.getStorageLocation(*BarDecl);
+        EXPECT_THAT(BarLoc, IsNull());
+      });
+}
+
+TEST(TransferTest, LambdaCaptureByCopyImplicit) {
+  std::string Code = R"(
+    void target(int Foo, int Bar) {
+      [=]() {
+        Foo;
+        // [[p]]
+      }();
+    }
+  )";
+  runDataflowOnLambda(
+      Code,
+      [](const llvm::StringMap<DataflowAnalysisState<NoopLattice>> &Results,
+         ASTContext &ASTCtx) {
+        ASSERT_THAT(Results.keys(), UnorderedElementsAre("p"));
+        const Environment &Env = getEnvironmentAtAnnotation(Results, "p");
+
+        const ValueDecl *FooDecl = findValueDecl(ASTCtx, "Foo");
+        ASSERT_THAT(FooDecl, NotNull());
+
+        const StorageLocation *FooLoc = Env.getStorageLocation(*FooDecl);
+        ASSERT_TRUE(isa_and_nonnull<ScalarStorageLocation>(FooLoc));
+
+        const Value *FooVal = Env.getValue(*FooLoc);
+        EXPECT_TRUE(isa_and_nonnull<IntegerValue>(FooVal));
+
+        const ValueDecl *BarDecl = findValueDecl(ASTCtx, "Bar");
+        ASSERT_THAT(BarDecl, NotNull());
+
+        // There is no storage location for `Bar` because it isn't used in the
+        // body of the lambda.
+        const StorageLocation *BarLoc = Env.getStorageLocation(*BarDecl);
+        EXPECT_THAT(BarLoc, IsNull());
+      });
+}
+
+TEST(TransferTest, LambdaCaptureByReferenceImplicit) {
+  std::string Code = R"(
+    void target(int Foo, int Bar) {
+      [&]() {
+        Foo;
+        // [[p]]
+      }();
+    }
+  )";
+  runDataflowOnLambda(
+      Code,
+      [](const llvm::StringMap<DataflowAnalysisState<NoopLattice>> &Results,
+         ASTContext &ASTCtx) {
+        ASSERT_THAT(Results.keys(), UnorderedElementsAre("p"));
+        const Environment &Env = getEnvironmentAtAnnotation(Results, "p");
+
+        const ValueDecl *FooDecl = findValueDecl(ASTCtx, "Foo");
+        ASSERT_THAT(FooDecl, NotNull());
+
+        const StorageLocation *FooLoc = Env.getStorageLocation(*FooDecl);
+        ASSERT_TRUE(isa_and_nonnull<ScalarStorageLocation>(FooLoc));
+
+        const Value *FooVal = Env.getValue(*FooLoc);
+        EXPECT_TRUE(isa_and_nonnull<IntegerValue>(FooVal));
+
+        const ValueDecl *BarDecl = findValueDecl(ASTCtx, "Bar");
+        ASSERT_THAT(BarDecl, NotNull());
+
+        // There is no storage location for `Bar` because it isn't used in the
+        // body of the lambda.
+        const StorageLocation *BarLoc = Env.getStorageLocation(*BarDecl);
+        EXPECT_THAT(BarLoc, IsNull());
+      });
+}
+
+TEST(TransferTest, LambdaCaptureThis) {
+  std::string Code = R"(
+    struct Bar {
+      int Foo;
+
+      void target() {
+        [this]() {
+          Foo;
+          // [[p]]
+        }();
+      }
+    };
+  )";
+  runDataflowOnLambda(
+      Code,
+      [](const llvm::StringMap<DataflowAnalysisState<NoopLattice>> &Results,
+         ASTContext &ASTCtx) {
+        ASSERT_THAT(Results.keys(), UnorderedElementsAre("p"));
+        const Environment &Env = getEnvironmentAtAnnotation(Results, "p");
+
+        const RecordStorageLocation *ThisPointeeLoc =
+            Env.getThisPointeeStorageLocation();
+        ASSERT_THAT(ThisPointeeLoc, NotNull());
+
+        const ValueDecl *FooDecl = findValueDecl(ASTCtx, "Foo");
+        ASSERT_THAT(FooDecl, NotNull());
+
+        const StorageLocation *FooLoc = ThisPointeeLoc->getChild(*FooDecl);
+        ASSERT_TRUE(isa_and_nonnull<ScalarStorageLocation>(FooLoc));
+
+        const Value *FooVal = Env.getValue(*FooLoc);
+        EXPECT_TRUE(isa_and_nonnull<IntegerValue>(FooVal));
+      });
+}
+
 } // namespace
diff --git a/clang/unittests/Analysis/FlowSensitive/UncheckedOptionalAccessModelTest.cpp b/clang/unittests/Analysis/FlowSensitive/UncheckedOptionalAccessModelTest.cpp
index 3f17557272d0344..c41a378a8341b71 100644
--- a/clang/unittests/Analysis/FlowSensitive/UncheckedOptionalAccessModelTest.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/UncheckedOptionalAccessModelTest.cpp
@@ -1280,6 +1280,12 @@ class UncheckedOptionalAccessTest
     ExpectDiagnosticsFor(SourceCode, ast_matchers::hasName("target"));
   }
 
+  void ExpectDiagnosticsForLambda(std::string SourceCode) {
+    ExpectDiagnosticsFor(
+        SourceCode, ast_matchers::hasDeclContext(
+                        ast_matchers::cxxRecordDecl(ast_matchers::isLambda())));
+  }
+
   template <typename FuncDeclMatcher>
   void ExpectDiagnosticsFor(std::string SourceCode,
                             FuncDeclMatcher FuncMatcher) {
@@ -3214,6 +3220,137 @@ TEST_P(UncheckedOptionalAccessTest, Bitfield) {
     }
   )");
 }
+
+TEST_P(UncheckedOptionalAccessTest, LambdaParam) {
+  ExpectDiagnosticsForLambda(R"(
+    #include "unchecked_optional_access_test.h"
+
+    void target() {
+      []($ns::$optional<int> opt) {
+        if (opt.has_value()) {
+          opt.value();
+        } else {
+          opt.value(); // [[unsafe]]
+        }
+      }(Make<$ns::$optional<int>>());
+    }
+  )");
+}
+
+TEST_P(UncheckedOptionalAccessTest, LambdaCaptureByCopy) {
+  ExpectDiagnosticsForLambda(R"(
+    #include "unchecked_optional_access_test.h"
+
+    void target($ns::$optional<int> opt) {
+      [opt]() {
+        if (opt.has_value()) {
+          opt.value();
+        } else {
+          opt.value(); // [[unsafe]]
+        }
+      }();
+    }
+  )");
+}
+
+TEST_P(UncheckedOptionalAccessTest, LambdaCaptureByReference) {
+  ExpectDiagnosticsForLambda(R"(
+    #include "unchecked_optional_access_test.h"
+
+    void target($ns::$optional<int> opt) {
+      [&opt]() {
+        if (opt.has_value()) {
+          opt.value();
+        } else {
+          opt.value(); // [[unsafe]]
+        }
+      }();
+    }
+  )");
+}
+
+TEST_P(UncheckedOptionalAccessTest, LambdaCaptureWithInitializer) {
+  ExpectDiagnosticsForLambda(R"(
+    #include "unchecked_optional_access_test.h"
+
+    void target($ns::$optional<int> opt) {
+      [opt2=opt]() {
+        if (opt2.has_value()) {
+          opt2.value();
+        } else {
+          opt2.value(); // [[unsafe]]
+        }
+      }();
+    }
+  )");
+}
+
+TEST_P(UncheckedOptionalAccessTest, LambdaCaptureByCopyImplicit) {
+  ExpectDiagnosticsForLambda(R"(
+    #include "unchecked_optional_access_test.h"
+
+    void target($ns::$optional<int> opt) {
+      [=]() {
+        if (opt.has_value()) {
+          opt.value();
+        } else {
+          opt.value(); // [[unsafe]]
+        }
+      }();
+    }
+  )");
+}
+
+TEST_P(UncheckedOptionalAccessTest, LambdaCaptureByReferenceImplicit) {
+  ExpectDiagnosticsForLambda(R"(
+    #include "unchecked_optional_access_test.h"
+
+    void target($ns::$optional<int> opt) {
+      [&]() {
+        if (opt.has_value()) {
+          opt.value();
+        } else {
+          opt.value(); // [[unsafe]]
+        }
+      }();
+    }
+  )");
+}
+
+TEST_P(UncheckedOptionalAccessTest, LambdaCaptureThis) {
+  ExpectDiagnosticsForLambda(R"(
+    #include "unchecked_optional_access_test.h"
+
+    struct Foo {
+      $ns::$optional<int> opt;
+
+      void target() {
+        [this]() {
+          if (opt.has_value()) {
+            opt.value();
+          } else {
+            opt.value(); // [[unsafe]]
+          }
+        }();
+      }
+    };
+  )");
+}
+
+TEST_P(UncheckedOptionalAccessTest, LambdaCaptureStateNotPropagated) {
+  // We can't propagate information from the surrounding context.
+  ExpectDiagnosticsForLambda(R"(
+    #include "unchecked_optional_access_test.h"
+
+    void target($ns::$optional<int> opt) {
+      if (opt.has_value()) {
+        [&opt]() {
+          opt.value(); // [[unsafe]]
+        }();
+      }
+    }
+  )");
+}
 // FIXME: Add support for:
 // - constructors (copy, move)
 // - assignment operators (default, copy, move)

>From c4e2dd5c5c9bfab1841bfcb940e4f7d0a55739c8 Mon Sep 17 00:00:00 2001
From: Amy Wang <kai.ting.wang at huawei.com>
Date: Wed, 11 Oct 2023 16:37:11 -0400
Subject: [PATCH 34/38] [mlir][python] python binding for the affine.store op
 (#68816)

This PR creates the necessary files to support bindings for operations
in the affine dialect.

This is the first of many PRs that will progressively introduce
affine.load, affine.for, and other operations. I would like to
acknowledge the work by Nelli's author @makslevental:
https://github.com/makslevental/nelli/blob/main/nelli/mlir/affine/affine.py
which jump-started this work.
---
 mlir/python/CMakeLists.txt                   | 10 ++++
 mlir/python/mlir/dialects/AffineOps.td       | 14 +++++
 mlir/python/mlir/dialects/_affine_ops_ext.py | 56 ++++++++++++++++++++
 mlir/python/mlir/dialects/affine.py          |  5 ++
 mlir/test/python/dialects/affine.py          | 44 +++++++++++++++
 5 files changed, 129 insertions(+)
 create mode 100644 mlir/python/mlir/dialects/AffineOps.td
 create mode 100644 mlir/python/mlir/dialects/_affine_ops_ext.py
 create mode 100644 mlir/python/mlir/dialects/affine.py
 create mode 100644 mlir/test/python/dialects/affine.py

diff --git a/mlir/python/CMakeLists.txt b/mlir/python/CMakeLists.txt
index 088d9a765b97730..c7b3c283a6b6dc1 100644
--- a/mlir/python/CMakeLists.txt
+++ b/mlir/python/CMakeLists.txt
@@ -46,6 +46,16 @@ declare_mlir_python_sources(MLIRPythonCAPI.HeaderSources
 # Dialect bindings
 ################################################################################
 
+declare_mlir_dialect_python_bindings(
+  ADD_TO_PARENT MLIRPythonSources.Dialects
+  ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/mlir"
+  TD_FILE dialects/AffineOps.td
+  SOURCES
+    dialects/affine.py
+    dialects/_affine_ops_ext.py
+  DIALECT_NAME affine
+  GEN_ENUM_BINDINGS)
+
 declare_mlir_dialect_python_bindings(
   ADD_TO_PARENT MLIRPythonSources.Dialects
   ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/mlir"
diff --git a/mlir/python/mlir/dialects/AffineOps.td b/mlir/python/mlir/dialects/AffineOps.td
new file mode 100644
index 000000000000000..e12ffafb8993097
--- /dev/null
+++ b/mlir/python/mlir/dialects/AffineOps.td
@@ -0,0 +1,14 @@
+//===-- AffineOps.td - Entry point for Affine bindings -----*- tablegen -*-===//
+//                                                                                                                      
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.                                        
+// See https://llvm.org/LICENSE.txt for license information.                                                            
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception                                                              
+//                                                                                                                      
+//===----------------------------------------------------------------------===// 
+
+#ifndef PYTHON_BINDINGS_AFFINE_OPS
+#define PYTHON_BINDINGS_AFFINE_OPS
+
+include "mlir/Dialect/Affine/IR/AffineOps.td"
+
+#endif
diff --git a/mlir/python/mlir/dialects/_affine_ops_ext.py b/mlir/python/mlir/dialects/_affine_ops_ext.py
new file mode 100644
index 000000000000000..dc465ce7aa1e5f9
--- /dev/null
+++ b/mlir/python/mlir/dialects/_affine_ops_ext.py
@@ -0,0 +1,56 @@
+#  Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+#  See https://llvm.org/LICENSE.txt for license information.
+#  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+try:
+    from ..ir import *
+    from ._ods_common import get_op_result_or_value as _get_op_result_or_value
+    from ._ods_common import get_op_results_or_values as _get_op_results_or_values
+except ImportError as e:
+    raise RuntimeError("Error loading imports from extension module") from e
+
+from typing import Optional, Sequence, Union
+
+
+class AffineStoreOp:
+    """Specialization for the Affine store operation."""
+
+    def __init__(
+        self,
+        value: Union[Operation, OpView, Value],
+        memref: Union[Operation, OpView, Value],
+        map: AffineMap=None,
+        *,
+        map_operands=None,
+        loc=None,
+        ip=None
+    ):
+        """Creates an affine store operation.
+
+        - `value`: the value to store into the memref.
+        - `memref`: the buffer to store into.
+        - `map`: the affine map that maps the map_operands to the index of the 
+          memref.
+        - `map_operands`: the list of arguments to substitute the dimensions, 
+          then symbols in the affine map, in increasing order.
+        """
+        map = map if map is not None else []
+        map_operands = map_operands if map_operands is not None else []
+        operands = [
+            _get_op_result_or_value(value),
+            _get_op_result_or_value(memref),
+            *[_get_op_result_or_value(op) for op in map_operands]
+        ]
+        results = []
+        attributes = {"map": AffineMapAttr.get(map)}
+        regions = None
+        _ods_successors = None
+        super().__init__(self.build_generic(
+            attributes=attributes,
+            results=results,
+            operands=operands,
+            successors=_ods_successors,
+            regions=regions,
+            loc=loc,
+            ip=ip
+        ))
diff --git a/mlir/python/mlir/dialects/affine.py b/mlir/python/mlir/dialects/affine.py
new file mode 100644
index 000000000000000..8a2a64c7c40d190
--- /dev/null
+++ b/mlir/python/mlir/dialects/affine.py
@@ -0,0 +1,5 @@
+#  Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.                                        
+#  See https://llvm.org/LICENSE.txt for license information.                                                            
+#  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception    
+
+from ._affine_ops_gen import *
diff --git a/mlir/test/python/dialects/affine.py b/mlir/test/python/dialects/affine.py
new file mode 100644
index 000000000000000..d2e664d4653420f
--- /dev/null
+++ b/mlir/test/python/dialects/affine.py
@@ -0,0 +1,44 @@
+# RUN: %PYTHON %s | FileCheck %s
+
+from mlir.ir import *
+import mlir.dialects.func as func
+import mlir.dialects.arith as arith
+import mlir.dialects.affine as affine
+import mlir.dialects.memref as memref
+
+
+def run(f):
+    print("\nTEST:", f.__name__)
+    f()
+    return f
+
+
+# CHECK-LABEL: TEST: testAffineStoreOp
+ at run
+def testAffineStoreOp():
+    with Context() as ctx, Location.unknown():
+        module = Module.create()
+        with InsertionPoint(module.body):
+            f32 = F32Type.get()
+            index_type = IndexType.get()
+            memref_type_out = MemRefType.get([12, 12], f32)
+
+            # CHECK: func.func @affine_store_test(%[[ARG0:.*]]: index) -> memref<12x12xf32> {
+            @func.FuncOp.from_py_func(index_type)
+            def affine_store_test(arg0):
+                # CHECK: %[[O_VAR:.*]] = memref.alloc() : memref<12x12xf32>
+                mem = memref.AllocOp(memref_type_out, [], []).result
+
+                d0 = AffineDimExpr.get(0)
+                s0 = AffineSymbolExpr.get(0)
+                map = AffineMap.get(1, 1, [s0 * 3, d0 + s0 + 1])
+
+                # CHECK: %[[A1:.*]] = arith.constant 2.100000e+00 : f32
+                a1 = arith.ConstantOp(f32, 2.1)
+
+                # CHECK: affine.store %[[A1]], %alloc[symbol(%[[ARG0]]) * 3, %[[ARG0]] + symbol(%[[ARG0]]) + 1] : memref<12x12xf32>
+                affine.AffineStoreOp(a1, mem, map, map_operands=[arg0, arg0])
+
+                return mem
+
+        print(module)

>From a36f8a234e63190ab2d05b9e35d9c730ea655b93 Mon Sep 17 00:00:00 2001
From: Walter Erquinigo <a20012251 at gmail.com>
Date: Wed, 11 Oct 2023 16:39:35 -0400
Subject: [PATCH 35/38] [lldb-vscode] Update installation instructions (#68234)

lldb-vscode had installation instructions based on creating a folder
inside ~/.vscode/extensions, which no longer works. A different
installation mechanism, based on a VSCode command, is needed. More
details can be found in the contents of this patch.

Closes https://github.com/llvm/llvm-project/issues/63655
---
 lldb/tools/lldb-vscode/README.md | 115 ++++++++++++++++---------------
 1 file changed, 60 insertions(+), 55 deletions(-)

diff --git a/lldb/tools/lldb-vscode/README.md b/lldb/tools/lldb-vscode/README.md
index 154ccefc5f59798..6f930293126d53e 100644
--- a/lldb/tools/lldb-vscode/README.md
+++ b/lldb/tools/lldb-vscode/README.md
@@ -1,18 +1,20 @@
 
 # Table of Contents
 
-- [Introduction](#Introduction)
-- [Installation](#Installation-Visual-Studio-Code)
+- [Table of Contents](#table-of-contents)
+- [Introduction](#introduction)
+- [Installation for Visual Studio Code](#installation-for-visual-studio-code)
 - [Configurations](#configurations)
-	- [Launch Configuration Settings](#launch-configuration-settings)
-	- [Attach Configuration Settings](#attach-configuration-settings)
-	- [Example configurations](#example-configurations)
-		- [Launching](#launching)
-		- [Attach to process using process ID](#attach-using-pid)
-		- [Attach to process by name](#attach-by-name)
-		- [Loading a core file](#loading-a-core-file)
-- [Custom Debugger Commands](#custom-debugger-commands)
-  - [startDebugging](#startDebugging)
+  - [Launch Configuration Settings](#launch-configuration-settings)
+  - [Attaching Settings](#attaching-settings)
+  - [Example configurations](#example-configurations)
+    - [Launching](#launching)
+    - [Attach using PID](#attach-using-pid)
+    - [Attach by Name](#attach-by-name)
+    - [Loading a Core File](#loading-a-core-file)
+- [Custom debugger commands](#custom-debugger-commands)
+  - [startDebugging](#startdebugging)
+  - [repl-mode](#repl-mode)
 
 # Introduction
 
@@ -24,52 +26,57 @@ get a full featured debugger with a well defined protocol.
 
 # Installation for Visual Studio Code
 
-Installing the plug-in involves creating a directory in the `~/.vscode/extensions` folder and copying the package.json file that is in the same directory as this
-documentation into it, and copying to symlinking a lldb-vscode binary into
-the `bin` directory inside the plug-in directory.
-
-If you want to make a stand alone plug-in that you can send to others on unix systems:
-
-```
-$ mkdir -p ~/.vscode/extensions/llvm-org.lldb-vscode-0.1.0/bin
-$ cp package.json ~/.vscode/extensions/llvm-org.lldb-vscode-0.1.0
-$ cd ~/.vscode/extensions/llvm-org.lldb-vscode-0.1.0/bin
-$ cp /path/to/a/built/lldb-vscode .
-$ cp /path/to/a/built/liblldb.so .
+Installing the plug-in involves creating a directory in any location outside of
+`~/.vscode/extensions`. For example, `~/vscode-lldb` is a valid one. You'll also
+need a subfolder `bin`, e.g. `~/vscode-lldb/bin`. Then copy the `package.json`
+file that is in the same directory as this documentation into it, and symlink
+the `lldb-vscode` binary into the `bin` directory inside the plug-in directory.
+
+Finally, in VS Code, execute the command
+`Developer: Install Extension from Location` and pick the folder you just
+created, which would be `~/vscode-lldb` following the example above.
+
+If you want to make a stand alone plug-in that you can send to others on UNIX
+systems:
+
+```bash
+mkdir -p ~/llvm-org.lldb-vscode-0.1.0/bin
+cp package.json ~/llvm-org.lldb-vscode-0.1.0
+cd ~/llvm-org.lldb-vscode-0.1.0/bin
+cp /path/to/a/built/lldb-vscode .
+cp /path/to/a/built/liblldb.so .
 ```
 
-It is important to note that the directory `~/.vscode/extensions` works for users logged in locally to the machine. If you are remoting into the box using Visual Studio Code's Remote plugins (SSH, WSL, Docker) it will look for extensions on `~/.vscode-server/extensions` only and you will not see your just installed lldb-vscode plug-in. If you want this plugin to be visible to remoting users, you will need to either repeat the process above for the `~/.vscode-server` folder or create a symbolic link from it to `~/.vscode/extensions`:
+If you want to make a stand alone plug-in that you can send to others on macOS
+systems:
 
-```
-$ cd ~/.vscode-server/extensions
-$ ln -s ~/.vscode/extensions/llvm-org.lldb-vscode-0.1.0  llvm-org.lldb-vscode-0.1.0
+```bash
+mkdir -p ~/llvm-org.lldb-vscode-0.1.0/bin
+cp package.json ~/llvm-org.lldb-vscode-0.1.0
+cd ~/llvm-org.lldb-vscode-0.1.0/bin
+cp /path/to/a/built/lldb-vscode .
+rsync -av /path/to/a/built/LLDB.framework LLDB.framework
 ```
 
-If you want to make a stand alone plug-in that you can send to others on macOS systems:
-
-```
-$ mkdir -p ~/.vscode/extensions/llvm-org.lldb-vscode-0.1.0/bin
-$ cp package.json ~/.vscode/extensions/llvm-org.lldb-vscode-0.1.0
-$ cd ~/.vscode/extensions/llvm-org.lldb-vscode-0.1.0/bin
-$ cp /path/to/a/built/lldb-vscode .
-$ rsync -av /path/to/a/built/LLDB.framework LLDB.framework
+You might need to create additional directories for the `liblldb.so` or
+`LLDB.framework` inside or next to the `bin` folder depending on how the
+[rpath](https://en.wikipedia.org/wiki/Rpath) is set in your `lldb-vscode`
+binary. By default, `Debug` builds of LLDB usually include the current
+executable directory in the rpath, so these steps should work for most
+people.
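+
+If you are not sure how the rpath is set in your `lldb-vscode` binary, you can
+inspect it directly. The commands below are only a sketch and assume the
+binary is in the current directory:
+
+```bash
+# Linux: print the RPATH/RUNPATH entries from the dynamic section
+readelf -d ./lldb-vscode | grep -E 'RPATH|RUNPATH'
+
+# macOS: print the LC_RPATH load commands
+otool -l ./lldb-vscode | grep -A2 LC_RPATH
+```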
+
+To create a plug-in that symlinks into your `lldb-vscode` in your build
+directory:
+
+```bash
+mkdir -p ~/llvm-org.lldb-vscode-0.1.0/bin
+cp package.json ~/llvm-org.lldb-vscode-0.1.0
+cd ~/llvm-org.lldb-vscode-0.1.0/bin
+ln -s /path/to/a/built/lldb-vscode
 ```
 
-You might need to create additional directories for the `liblldb.so` or `LLDB.framework` inside or next to the `bin` folder depending on how the [rpath](https://en.wikipedia.org/wiki/Rpath) is set in your `lldb-vscode` binary. By default the `Debug` builds of LLDB usually includes
-the current executable directory in the rpath, so these steps should work for most people.
-
-To create a plug-in that symlinks into your `lldb-vscode` in your build directory:
-
-```
-$ mkdir -p ~/.vscode/extensions/llvm-org.lldb-vscode-0.1.0/bin
-$ cp package.json ~/.vscode/extensions/llvm-org.lldb-vscode-0.1.0
-$ cd ~/.vscode/extensions/llvm-org.lldb-vscode-0.1.0/bin
-$ ln -s /path/to/a/built/lldb-vscode
-```
-
-This is handy if you want to debug and develope the `lldb-vscode` executable when adding features or fixing bugs.
-
-
+This is handy if you want to debug and develop the `lldb-vscode` executable
+when adding features or fixing bugs.
 
 # Configurations
 
@@ -127,7 +134,6 @@ The JSON configuration file can contain the following `lldb-vscode` specific lau
 |**terminateCommands** |[string]| | LLDB commands executed when the debugging session ends. Commands and command output will be sent to the debugger console when they are executed.
 |**attachCommands** |[string]| | LLDB commands that will be executed after **preRunCommands** which take place of the code that normally does the attach. The commands can create a new target and attach or launch it however desired. This allows custom launch and attach configurations. Core files can use `target create --core /path/to/core` to attach to core files.
 
-
 ## Example configurations
 
 ### Launching
@@ -191,7 +197,6 @@ to be launched you can add the "waitFor" key value pair:
 This will work as long as the architecture, vendor and OS supports waiting
 for processes. Currently MacOS is the only platform that supports this.
 
-
 ### Loading a Core File
 
 This loads the coredump file `/cores/123.core` associated with the program
@@ -242,12 +247,12 @@ This will launch a server and then request a child debug session for a client.
 Inspect or adjust the behavior of lldb-vscode repl evaluation requests. The
 supported modes are `variable`, `command` and `auto`.
 
-*  `variable` - Variable mode expressions are evaluated in the context of the
+- `variable` - Variable mode expressions are evaluated in the context of the
    current frame. Use a `\`` prefix on the command to run an lldb command.
-*  `command` - Command mode expressions are evaluated as lldb commands, as a
+- `command` - Command mode expressions are evaluated as lldb commands, as a
    result, values printed by lldb are always stringified representations of the
    expression output.
-*  `auto` - Auto mode will attempt to infer if the expression represents an lldb
+- `auto` - Auto mode will attempt to infer if the expression represents an lldb
    command or a variable expression. A heuristic is used to infer if the input
    represents a variable or a command. Use a `\`` prefix to ensure an expression
    is evaluated as a command.

>From fc3541a2d9f8973fae38199cc607ef3232400187 Mon Sep 17 00:00:00 2001
From: Nicolas Vasilache <nicolasvasilache at users.noreply.github.com>
Date: Wed, 11 Oct 2023 14:56:09 -0700
Subject: [PATCH 36/38] [mlir][Transform] Create a transform interpreter and a
 preloader pass (#68661)

This revision provides the ability to use an arbitrary named sequence
op as the entry point to a transform dialect strategy.

It is also a step towards better transform dialect usage in pass
pipelines that need to preload a transform library rather than parse it
on the fly.

The interpreter itself is significantly simpler than its testing
counterpart by avoiding payload/debug root tags and multiple shared
modules.

In the process, the NamedSequenceOp::apply function is adapted to allow
it to serve as an entry point.

NamedSequenceOp is **not** extended to take the PossibleTopLevelTrait
at this time, because the implementation of the trait is specific to
allowing one top-level dangling op with a region, such as SequenceOp or
AlternativesOp. In particular, the verifier of PossibleTopLevelTrait
does not allow an empty body, which is necessary to declare a
NamedSequenceOp that gets linked in separately before application.

In the future, we should dispense with the PossibleTopLevelTrait
altogether and always enter the interpreter with a NamedSequenceOp.

Lastly, relevant TD linking utilities are moved to
TransformInterpreterUtils and reused from there.
---
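A minimal usage sketch of the two new passes (file names are
placeholders; the assumption is that `payload.mlir` contains the IR to
transform and `library.mlir` is a module carrying the
`transform.with_named_sequence` attribute together with a named
sequence called `@__transform_main`):

    mlir-opt payload.mlir \
      --transform-preload-library='transform-library-paths=library.mlir' \
      --transform-interpreter='entry-point=__transform_main'

The `entry-point` option can be dropped here, since `__transform_main`
is the default declared in Passes.td.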
---
 .../Dialect/Transform/Transforms/Passes.td    |  31 +++++
 .../Transforms/TransformInterpreterUtils.h    |  18 +++
 .../Transform/IR/TransformInterfaces.cpp      |   6 +-
 .../lib/Dialect/Transform/IR/TransformOps.cpp |  16 ++-
 .../Transform/Transforms/CMakeLists.txt       |   2 +
 .../Transform/Transforms/InterpreterPass.cpp  |  51 +++++++
 .../Transforms/PreloadLibraryPass.cpp         |  39 ++++++
 .../TransformInterpreterPassBase.cpp          |  53 --------
 .../Transforms/TransformInterpreterUtils.cpp  | 124 ++++++++++++++++--
 ...=> lower-to-llvm-e2e-with-target-tag.mlir} |   0
 .../Transform/Library/lower-to-llvm.mlir      |  48 +++++++
 .../mlir/test/Dialect/BUILD.bazel             |   1 +
 12 files changed, 320 insertions(+), 69 deletions(-)
 create mode 100644 mlir/lib/Dialect/Transform/Transforms/InterpreterPass.cpp
 create mode 100644 mlir/lib/Dialect/Transform/Transforms/PreloadLibraryPass.cpp
 rename mlir/test/Dialect/LLVM/{lower-to-llvm-e2e.mlir => lower-to-llvm-e2e-with-target-tag.mlir} (100%)
 create mode 100644 mlir/test/Dialect/Transform/Library/lower-to-llvm.mlir

diff --git a/mlir/include/mlir/Dialect/Transform/Transforms/Passes.td b/mlir/include/mlir/Dialect/Transform/Transforms/Passes.td
index 2400066c8ad8c8b..c900fee76b814d3 100644
--- a/mlir/include/mlir/Dialect/Transform/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/Transform/Transforms/Passes.td
@@ -42,4 +42,35 @@ def InferEffectsPass : Pass<"transform-infer-effects"> {
   }];
 }
 
+def PreloadLibraryPass : Pass<"transform-preload-library"> {
+  let summary = "preload transform dialect library";
+  let description = [{
+    This pass preloads a transform library and makes it available to subsequent
+    transform interpreter passes. The library is loaded into the Transform
+    dialect itself, a mechanism that currently provides very limited
+    functionality and does not scale.
+
+    Warning: Only a single such pass should exist for a given MLIR context.
+    This is a temporary solution until a resource-based solution is available.
+    TODO: investigate using a resource blob if some ownership mode allows it.
+  }];
+  let options = [
+    ListOption<"transformLibraryPaths", "transform-library-paths", "std::string",
+    "Optional paths to files with modules that should be merged into the "
+    "transform module to provide the definitions of external named sequences.">
+  ];
+}
+
+def InterpreterPass : Pass<"transform-interpreter"> {
+  let summary = "transform dialect interpreter";
+  let description = [{
+    This pass runs the transform dialect interpreter and applies the named
+    sequence transformation specified by the provided name (defaults to
+    `__transform_main`).
+  }];
+  let options = [
+    Option<"entryPoint", "entry-point", "std::string",
+           /*default=*/[{"__transform_main"}],
+           "Entry point of the pass pipeline.">,
+  ];
+}
 #endif // MLIR_DIALECT_TRANSFORM_TRANSFORMS_PASSES
diff --git a/mlir/include/mlir/Dialect/Transform/Transforms/TransformInterpreterUtils.h b/mlir/include/mlir/Dialect/Transform/Transforms/TransformInterpreterUtils.h
index 36c80e6fd61d3c1..3fc02267f26e9da 100644
--- a/mlir/include/mlir/Dialect/Transform/Transforms/TransformInterpreterUtils.h
+++ b/mlir/include/mlir/Dialect/Transform/Transforms/TransformInterpreterUtils.h
@@ -26,6 +26,16 @@ class Region;
 
 namespace transform {
 namespace detail {
+
+/// Expands the given list of `paths` to a list of `.mlir` files.
+///
+/// Each entry in `paths` may either be a regular file, in which case it ends up
+/// in the result list, or a directory, in which case all (regular) `.mlir`
+/// files in that directory are added. Any other file types lead to a failure.
+LogicalResult expandPathsToMLIRFiles(ArrayRef<std::string> &paths,
+                                     MLIRContext *context,
+                                     SmallVectorImpl<std::string> &fileNames);
+
 /// Utility to parse and verify the content of a `transformFileName` MLIR file
 /// containing a transform dialect specification.
 LogicalResult
@@ -33,6 +43,14 @@ parseTransformModuleFromFile(MLIRContext *context,
                              llvm::StringRef transformFileName,
                              OwningOpRef<ModuleOp> &transformModule);
 
+/// Utility to parse, verify, aggregate and link the content of all mlir files
+/// nested under `transformLibraryPaths` and containing transform dialect
+/// specifications.
+LogicalResult
+assembleTransformLibraryFromPaths(MLIRContext *context,
+                                  ArrayRef<std::string> transformLibraryPaths,
+                                  OwningOpRef<ModuleOp> &transformModule);
+
 /// Utility to load a transform interpreter `module` from a module that has
 /// already been preloaded in the context.
 /// This mode is useful in cases where explicit parsing of a transform library
diff --git a/mlir/lib/Dialect/Transform/IR/TransformInterfaces.cpp b/mlir/lib/Dialect/Transform/IR/TransformInterfaces.cpp
index 4f88b8522e54c80..8f860c4e54f41d5 100644
--- a/mlir/lib/Dialect/Transform/IR/TransformInterfaces.cpp
+++ b/mlir/lib/Dialect/Transform/IR/TransformInterfaces.cpp
@@ -60,9 +60,9 @@ ArrayRef<Operation *>
 transform::TransformState::getPayloadOpsView(Value value) const {
   const TransformOpMapping &operationMapping = getMapping(value).direct;
   auto iter = operationMapping.find(value);
-  assert(
-      iter != operationMapping.end() &&
-      "cannot find mapping for payload handle (param/value handle provided?)");
+  assert(iter != operationMapping.end() &&
+         "cannot find mapping for payload handle (param/value handle "
+         "provided?)");
   return iter->getSecond();
 }
 
diff --git a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp
index f8eb20e1a1da9c6..7b1badd0adae9ff 100644
--- a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp
+++ b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp
@@ -1761,8 +1761,20 @@ DiagnosedSilenceableFailure
 transform::NamedSequenceOp::apply(transform::TransformRewriter &rewriter,
                                   transform::TransformResults &results,
                                   transform::TransformState &state) {
-  // Nothing to do here.
-  return DiagnosedSilenceableFailure::success();
+  if (isExternal())
+    return emitDefiniteFailure() << "unresolved external named sequence";
+
+  // Map the entry block argument to the list of operations.
+  // Note: this is the same implementation as PossibleTopLevelTransformOp but
+  // without attaching the interface / trait since that is tailored to a
+  // dangling top-level op that does not get "called".
+  auto scope = state.make_region_scope(getBody());
+  if (failed(detail::mapPossibleTopLevelTransformOpBlockArguments(
+          state, this->getOperation(), getBody())))
+    return DiagnosedSilenceableFailure::definiteFailure();
+
+  return applySequenceBlock(getBody().front(),
+                            FailurePropagationMode::Propagate, state, results);
 }
 
 void transform::NamedSequenceOp::getEffects(
diff --git a/mlir/lib/Dialect/Transform/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Transform/Transforms/CMakeLists.txt
index 8774a8b86fb0d91..f0f57874f5e7032 100644
--- a/mlir/lib/Dialect/Transform/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/Transform/Transforms/CMakeLists.txt
@@ -1,6 +1,8 @@
 add_mlir_dialect_library(MLIRTransformDialectTransforms
   CheckUses.cpp
   InferEffects.cpp
+  InterpreterPass.cpp
+  PreloadLibraryPass.cpp
   TransformInterpreterPassBase.cpp
   TransformInterpreterUtils.cpp
 
diff --git a/mlir/lib/Dialect/Transform/Transforms/InterpreterPass.cpp b/mlir/lib/Dialect/Transform/Transforms/InterpreterPass.cpp
new file mode 100644
index 000000000000000..f473d5aa728c519
--- /dev/null
+++ b/mlir/lib/Dialect/Transform/Transforms/InterpreterPass.cpp
@@ -0,0 +1,51 @@
+//===- InterpreterPass.cpp - Transform dialect interpreter pass -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Transform/IR/TransformDialect.h"
+#include "mlir/Dialect/Transform/Transforms/Passes.h"
+#include "mlir/Dialect/Transform/Transforms/TransformInterpreterUtils.h"
+
+using namespace mlir;
+
+namespace mlir {
+namespace transform {
+#define GEN_PASS_DEF_INTERPRETERPASS
+#include "mlir/Dialect/Transform/Transforms/Passes.h.inc"
+} // namespace transform
+} // namespace mlir
+
+namespace {
+class InterpreterPass
+    : public transform::impl::InterpreterPassBase<InterpreterPass> {
+public:
+  using Base::Base;
+
+  LogicalResult initialize(MLIRContext *context) override {
+    // TODO: investigate using a resource blob if some ownership mode allows it.
+    transformModule = transform::detail::getPreloadedTransformModule(context);
+    return success();
+  }
+
+  void runOnOperation() override {
+    if (failed(transform::applyTransformNamedSequence(
+            getOperation(), transformModule,
+            options.enableExpensiveChecks(true), entryPoint)))
+      return signalPassFailure();
+  }
+
+private:
+  /// Transform interpreter options.
+  transform::TransformOptions options;
+
+  /// The separate transform module to be used for transformations, shared
+  /// across multiple instances of the pass if it is applied in parallel to
+  /// avoid potentially expensive cloning. MUST NOT be modified after the pass
+  /// has been initialized.
+  ModuleOp transformModule;
+};
+} // namespace
diff --git a/mlir/lib/Dialect/Transform/Transforms/PreloadLibraryPass.cpp b/mlir/lib/Dialect/Transform/Transforms/PreloadLibraryPass.cpp
new file mode 100644
index 000000000000000..d2e7108c0288623
--- /dev/null
+++ b/mlir/lib/Dialect/Transform/Transforms/PreloadLibraryPass.cpp
@@ -0,0 +1,39 @@
+//===- PreloadLibraryPass.cpp - Pass to preload a transform library -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Transform/IR/TransformDialect.h"
+#include "mlir/Dialect/Transform/Transforms/Passes.h"
+#include "mlir/Dialect/Transform/Transforms/TransformInterpreterUtils.h"
+
+using namespace mlir;
+
+namespace mlir {
+namespace transform {
+#define GEN_PASS_DEF_PRELOADLIBRARYPASS
+#include "mlir/Dialect/Transform/Transforms/Passes.h.inc"
+} // namespace transform
+} // namespace mlir
+
+namespace {
+class PreloadLibraryPass
+    : public transform::impl::PreloadLibraryPassBase<PreloadLibraryPass> {
+public:
+  using Base::Base;
+
+  void runOnOperation() override {
+    OwningOpRef<ModuleOp> mergedParsedLibraries;
+    if (failed(transform::detail::assembleTransformLibraryFromPaths(
+            &getContext(), transformLibraryPaths, mergedParsedLibraries)))
+      return signalPassFailure();
+    // TODO: investigate using a resource blob if some ownership mode allows it.
+    auto *dialect =
+        getContext().getOrLoadDialect<transform::TransformDialect>();
+    dialect->registerLibraryModule(std::move(mergedParsedLibraries));
+  }
+};
+} // namespace
diff --git a/mlir/lib/Dialect/Transform/Transforms/TransformInterpreterPassBase.cpp b/mlir/lib/Dialect/Transform/Transforms/TransformInterpreterPassBase.cpp
index 5f35b6789dc94fe..538c81fe39fddb2 100644
--- a/mlir/lib/Dialect/Transform/Transforms/TransformInterpreterPassBase.cpp
+++ b/mlir/lib/Dialect/Transform/Transforms/TransformInterpreterPassBase.cpp
@@ -357,59 +357,6 @@ LogicalResult transform::detail::interpreterBaseRunOnOperationImpl(
                          extraMappings, options);
 }
 
-/// Expands the given list of `paths` to a list of `.mlir` files.
-///
-/// Each entry in `paths` may either be a regular file, in which case it ends up
-/// in the result list, or a directory, in which case all (regular) `.mlir`
-/// files in that directory are added. Any other file types lead to a failure.
-static LogicalResult
-expandPathsToMLIRFiles(ArrayRef<std::string> &paths, MLIRContext *const context,
-                       SmallVectorImpl<std::string> &fileNames) {
-  for (const std::string &path : paths) {
-    auto loc = FileLineColLoc::get(context, path, 0, 0);
-
-    if (llvm::sys::fs::is_regular_file(path)) {
-      LLVM_DEBUG(DBGS() << "Adding '" << path << "' to list of files\n");
-      fileNames.push_back(path);
-      continue;
-    }
-
-    if (!llvm::sys::fs::is_directory(path)) {
-      return emitError(loc)
-             << "'" << path << "' is neither a file nor a directory";
-    }
-
-    LLVM_DEBUG(DBGS() << "Looking for files in '" << path << "':\n");
-
-    std::error_code ec;
-    for (llvm::sys::fs::directory_iterator it(path, ec), itEnd;
-         it != itEnd && !ec; it.increment(ec)) {
-      const std::string &fileName = it->path();
-
-      if (it->type() != llvm::sys::fs::file_type::regular_file) {
-        LLVM_DEBUG(DBGS() << "  Skipping non-regular file '" << fileName
-                          << "'\n");
-        continue;
-      }
-
-      if (!StringRef(fileName).endswith(".mlir")) {
-        LLVM_DEBUG(DBGS() << "  Skipping '" << fileName
-                          << "' because it does not end with '.mlir'\n");
-        continue;
-      }
-
-      LLVM_DEBUG(DBGS() << "  Adding '" << fileName << "' to list of files\n");
-      fileNames.push_back(fileName);
-    }
-
-    if (ec)
-      return emitError(loc) << "error while opening files in '" << path
-                            << "': " << ec.message();
-  }
-
-  return success();
-}
-
 LogicalResult transform::detail::interpreterBaseInitializeImpl(
     MLIRContext *context, StringRef transformFileName,
     ArrayRef<std::string> transformLibraryPaths,
diff --git a/mlir/lib/Dialect/Transform/Transforms/TransformInterpreterUtils.cpp b/mlir/lib/Dialect/Transform/Transforms/TransformInterpreterUtils.cpp
index 1a6ebdd16232e8a..41feffffaf97b3f 100644
--- a/mlir/lib/Dialect/Transform/Transforms/TransformInterpreterUtils.cpp
+++ b/mlir/lib/Dialect/Transform/Transforms/TransformInterpreterUtils.cpp
@@ -23,6 +23,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/FileSystem.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/raw_ostream.h"
 
@@ -31,6 +32,59 @@ using namespace mlir;
 #define DEBUG_TYPE "transform-dialect-interpreter-utils"
 #define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE << "]: ")
 
+/// Expands the given list of `paths` to a list of `.mlir` files.
+///
+/// Each entry in `paths` may either be a regular file, in which case it ends up
+/// in the result list, or a directory, in which case all (regular) `.mlir`
+/// files in that directory are added. Any other file types lead to a failure.
+LogicalResult transform::detail::expandPathsToMLIRFiles(
+    ArrayRef<std::string> &paths, MLIRContext *context,
+    SmallVectorImpl<std::string> &fileNames) {
+  for (const std::string &path : paths) {
+    auto loc = FileLineColLoc::get(context, path, 0, 0);
+
+    if (llvm::sys::fs::is_regular_file(path)) {
+      LLVM_DEBUG(DBGS() << "Adding '" << path << "' to list of files\n");
+      fileNames.push_back(path);
+      continue;
+    }
+
+    if (!llvm::sys::fs::is_directory(path)) {
+      return emitError(loc)
+             << "'" << path << "' is neither a file nor a directory";
+    }
+
+    LLVM_DEBUG(DBGS() << "Looking for files in '" << path << "':\n");
+
+    std::error_code ec;
+    for (llvm::sys::fs::directory_iterator it(path, ec), itEnd;
+         it != itEnd && !ec; it.increment(ec)) {
+      const std::string &fileName = it->path();
+
+      if (it->type() != llvm::sys::fs::file_type::regular_file) {
+        LLVM_DEBUG(DBGS() << "  Skipping non-regular file '" << fileName
+                          << "'\n");
+        continue;
+      }
+
+      if (!StringRef(fileName).endswith(".mlir")) {
+        LLVM_DEBUG(DBGS() << "  Skipping '" << fileName
+                          << "' because it does not end with '.mlir'\n");
+        continue;
+      }
+
+      LLVM_DEBUG(DBGS() << "  Adding '" << fileName << "' to list of files\n");
+      fileNames.push_back(fileName);
+    }
+
+    if (ec)
+      return emitError(loc) << "error while opening files in '" << path
+                            << "': " << ec.message();
+  }
+
+  return success();
+}
+
 LogicalResult transform::detail::parseTransformModuleFromFile(
     MLIRContext *context, llvm::StringRef transformFileName,
     OwningOpRef<ModuleOp> &transformModule) {
@@ -91,10 +145,50 @@ transform::detail::findTransformEntryPoint(Operation *root, ModuleOp module,
   return nullptr;
 }
 
-/// Return whether `func1` can be merged into `func2`. For that to work `func1`
-/// has to be a declaration (aka has to be external) and `func2` either has to
-/// be a declaration as well, or it has to be public (otherwise, it wouldn't
-/// be visible by `func1`).
+LogicalResult transform::detail::assembleTransformLibraryFromPaths(
+    MLIRContext *context, ArrayRef<std::string> transformLibraryPaths,
+    OwningOpRef<ModuleOp> &transformModule) {
+  // Assemble list of library files.
+  SmallVector<std::string> libraryFileNames;
+  if (failed(detail::expandPathsToMLIRFiles(transformLibraryPaths, context,
+                                            libraryFileNames)))
+    return failure();
+
+  // Parse modules from library files.
+  SmallVector<OwningOpRef<ModuleOp>> parsedLibraries;
+  for (const std::string &libraryFileName : libraryFileNames) {
+    OwningOpRef<ModuleOp> parsedLibrary;
+    if (failed(transform::detail::parseTransformModuleFromFile(
+            context, libraryFileName, parsedLibrary)))
+      return failure();
+    parsedLibraries.push_back(std::move(parsedLibrary));
+  }
+
+  // Merge parsed libraries into one module.
+  auto loc = FileLineColLoc::get(context, "<shared-library-module>", 0, 0);
+  OwningOpRef<ModuleOp> mergedParsedLibraries =
+      ModuleOp::create(loc, "__transform");
+  {
+    mergedParsedLibraries.get()->setAttr("transform.with_named_sequence",
+                                         UnitAttr::get(context));
+    IRRewriter rewriter(context);
+    // TODO: extend `mergeSymbolsInto` to support multiple `other` modules.
+    for (OwningOpRef<ModuleOp> &parsedLibrary : parsedLibraries) {
+      if (failed(transform::detail::mergeSymbolsInto(
+              mergedParsedLibraries.get(), std::move(parsedLibrary))))
+        return mergedParsedLibraries->emitError()
+               << "failed to verify merged transform module";
+    }
+  }
+
+  transformModule = std::move(mergedParsedLibraries);
+  return success();
+}
+
+/// Return whether `func1` can be merged into `func2`. For that to work
+/// `func1` has to be a declaration (aka has to be external) and `func2`
+/// either has to be a declaration as well, or it has to be public (otherwise,
+/// it wouldn't be visible by `func1`).
 static bool canMergeInto(FunctionOpInterface func1, FunctionOpInterface func2) {
   return func1.isExternal() && (func2.isPublic() || func2.isExternal());
 }
@@ -281,8 +375,9 @@ transform::detail::mergeSymbolsInto(Operation *target,
       auto collidingFuncOp =
           cast<FunctionOpInterface>(collidingOp.getOperation());
 
-      // Both ops are in the target module now and can be treated symmetrically,
-      // so w.l.o.g. we can reduce to merging `funcOp` into `collidingFuncOp`.
+      // Both ops are in the target module now and can be treated
+      // symmetrically, so w.l.o.g. we can reduce to merging `funcOp` into
+      // `collidingFuncOp`.
       if (!canMergeInto(funcOp, collidingFuncOp)) {
         std::swap(funcOp, collidingFuncOp);
       }
@@ -317,18 +412,25 @@ LogicalResult transform::applyTransformNamedSequence(
     const TransformOptions &options, StringRef entryPoint) {
   Operation *transformRoot =
       detail::findTransformEntryPoint(payload, transformModule, entryPoint);
-  if (!transformRoot)
-    return failure();
+  if (!transformRoot) {
+    return payload->emitError()
+           << "could not find transform entry point: " << entryPoint
+           << " in either payload or transform module";
+  }
 
   // `transformModule` may not be modified.
-  OwningOpRef<Operation *> clonedTransformModule(transformModule->clone());
   if (transformModule && !transformModule->isAncestor(transformRoot)) {
+    OwningOpRef<Operation *> clonedTransformModule(transformModule->clone());
     if (failed(detail::mergeSymbolsInto(
             SymbolTable::getNearestSymbolTable(transformRoot),
-            std::move(clonedTransformModule))))
-      return failure();
+            std::move(clonedTransformModule)))) {
+      return payload->emitError() << "failed to merge symbols";
+    }
   }
 
+  LLVM_DEBUG(DBGS() << "Apply\n" << *transformRoot << "\n");
+  LLVM_DEBUG(DBGS() << "To\n" << *payload << "\n");
+
   // Apply the transform to the IR, do not enforce top-level constraints.
   RaggedArray<MappedValue> noExtraMappings;
   return applyTransforms(payload, cast<TransformOpInterface>(transformRoot),
diff --git a/mlir/test/Dialect/LLVM/lower-to-llvm-e2e.mlir b/mlir/test/Dialect/LLVM/lower-to-llvm-e2e-with-target-tag.mlir
similarity index 100%
rename from mlir/test/Dialect/LLVM/lower-to-llvm-e2e.mlir
rename to mlir/test/Dialect/LLVM/lower-to-llvm-e2e-with-target-tag.mlir
diff --git a/mlir/test/Dialect/Transform/Library/lower-to-llvm.mlir b/mlir/test/Dialect/Transform/Library/lower-to-llvm.mlir
new file mode 100644
index 000000000000000..0ba50bd2362b34d
--- /dev/null
+++ b/mlir/test/Dialect/Transform/Library/lower-to-llvm.mlir
@@ -0,0 +1,48 @@
+// RUN: mlir-opt %s
+
+/// Schedule to lower to LLVM.
+module @lower_module_to_llvm attributes { transform.with_named_sequence } {
+
+transform.named_sequence @lower_to_llvm(
+    %module: !transform.any_op {transform.readonly}) -> !transform.any_op {
+
+  %func = transform.structured.match ops{["func.func"]} in %module : (!transform.any_op) -> !transform.any_op
+  %f = transform.apply_registered_pass "convert-vector-to-scf" to %func : (!transform.any_op) -> !transform.any_op
+  %f2 = transform.apply_registered_pass "convert-linalg-to-loops" to %f : (!transform.any_op) -> !transform.any_op
+  %f3 = transform.apply_registered_pass "convert-scf-to-cf" to %f2 : (!transform.any_op) -> !transform.any_op
+  %f4 = transform.apply_registered_pass "expand-strided-metadata" to %f3 : (!transform.any_op) -> !transform.any_op
+  %f5 = transform.apply_registered_pass "lower-affine" to %f4 : (!transform.any_op) -> !transform.any_op
+
+  transform.apply_conversion_patterns to %f5 {
+    transform.apply_conversion_patterns.dialect_to_llvm "math"
+    transform.apply_conversion_patterns.vector.vector_to_llvm
+    transform.apply_conversion_patterns.dialect_to_llvm "memref"
+    transform.apply_conversion_patterns.func.func_to_llvm
+    transform.apply_conversion_patterns.dialect_to_llvm "index"
+    transform.apply_conversion_patterns.dialect_to_llvm "arith"
+    transform.apply_conversion_patterns.dialect_to_llvm "cf"
+  } with type_converter {
+    transform.apply_conversion_patterns.memref.memref_to_llvm_type_converter
+      {index_bitwidth = 64,
+       use_bare_ptr = false,
+       use_bare_ptr_memref_call_conv = false,
+       use_opaque_pointers = true}
+  } {
+    legal_dialects = ["llvm"],
+    partial_conversion
+  } : !transform.any_op
+
+  // Need to rematch here because:
+  //   1. applying reconcile-unrealized-casts on the whole module would, at
+  //      this time, apply the transform to the transform IR itself when
+  //      called from a named sequence.
+  //   2. apply_conversion_patterns consumes the func but does not produce
+  //      a new llvm.func.
+  %f6 = transform.structured.match ops{["llvm.func"]} in %module
+    : (!transform.any_op) -> !transform.any_op
+  %f7 = transform.apply_registered_pass "reconcile-unrealized-casts" to %f6
+    : (!transform.any_op) -> !transform.any_op
+  transform.yield %module : !transform.any_op
+}
+
+} // transform module
diff --git a/utils/bazel/llvm-project-overlay/mlir/test/Dialect/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/Dialect/BUILD.bazel
index 061404078bc32ca..afed5f7fe1d4f08 100644
--- a/utils/bazel/llvm-project-overlay/mlir/test/Dialect/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/test/Dialect/BUILD.bazel
@@ -21,6 +21,7 @@ package(default_visibility = ["//visibility:public"])
             "Transform/*-source.mlir",
             "Transform/*-symbol-def.mlir",
             "Transform/*-symbol-decl-and-schedule.mlir",
+            "Transform/Library/*.mlir",
             "Transform/test-interpreter-library/*.mlir",
         ]),
     )

>From 543111b766c0ac244309b20456f4f4c6e2ba536a Mon Sep 17 00:00:00 2001
From: weiguozhi <57237827+weiguozhi at users.noreply.github.com>
Date: Wed, 11 Oct 2023 14:57:15 -0700
Subject: [PATCH 37/38] [RA] Disable split around hint register if optimize for
 size (#68619)

Splitting a virtual register with a hint may generate COPY instructions
in multiple cold basic blocks and increase code size, so disable this
split when the function is optimized for size.
---
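To see the effect locally, the added test can be run with something
like the following (a sketch; it assumes an in-tree checkout and an
`llc` built from this revision):

    llc -mtriple=x86_64-linux llvm/test/CodeGen/X86/no-split-size.ll -o -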
 llvm/lib/CodeGen/RegAllocGreedy.cpp       |  6 ++
 llvm/test/CodeGen/ARM/thumb2-size-opt.ll  |  4 +-
 llvm/test/CodeGen/X86/no-split-size.ll    | 90 +++++++++++++++++++++++
 llvm/test/DebugInfo/ARM/sdag-split-arg.ll |  4 +-
 4 files changed, 100 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/no-split-size.ll

diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index 4d476924a7dbf7b..349d8b0975f3a10 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -1217,6 +1217,12 @@ bool RAGreedy::trySplitAroundHintReg(MCPhysReg Hint,
                                      const LiveInterval &VirtReg,
                                      SmallVectorImpl<Register> &NewVRegs,
                                      AllocationOrder &Order) {
+  // Splitting the VirtReg may generate COPY instructions in multiple cold
+  // basic blocks and increase code size, so we avoid it when the function is
+  // optimized for size.
+  if (MF->getFunction().hasOptSize())
+    return false;
+
   // Don't allow repeated splitting as a safe guard against looping.
   if (ExtraInfo->getStage(VirtReg) >= RS_Split2)
     return false;
diff --git a/llvm/test/CodeGen/ARM/thumb2-size-opt.ll b/llvm/test/CodeGen/ARM/thumb2-size-opt.ll
index 8cf7a702e8ed54d..f9f29fc064a20ce 100644
--- a/llvm/test/CodeGen/ARM/thumb2-size-opt.ll
+++ b/llvm/test/CodeGen/ARM/thumb2-size-opt.ll
@@ -85,8 +85,8 @@ entry:
 
 define i32 @bundled_instruction(ptr %addr, ptr %addr2, i1 %tst) minsize {
 ; CHECK-LABEL: bundled_instruction:
-; CHECK: iteee ne
-; CHECK: ldmeq r2!, {{{r[0-9]+}}}
+; CHECK: itee ne
+; CHECK: ldmeq r3!, {{{r[0-9]+}}}
   br i1 %tst, label %true, label %false
 
 true:
diff --git a/llvm/test/CodeGen/X86/no-split-size.ll b/llvm/test/CodeGen/X86/no-split-size.ll
new file mode 100644
index 000000000000000..c1f93acd77dee27
--- /dev/null
+++ b/llvm/test/CodeGen/X86/no-split-size.ll
@@ -0,0 +1,90 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
+
+; @foo is optimized for size. Variables %p2, %p3, %p4, %p5 and %p6 are not split
+; in cold blocks.
+
+define i64 @foo(ptr %ptr, i64 %p2, i64 %p3, i64 %p4, i64 %p5, i64 %p6) optsize {
+; CHECK-LABEL: foo:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %r15
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    pushq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    pushq %r13
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    pushq %r12
+; CHECK-NEXT:    .cfi_def_cfa_offset 40
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    .cfi_offset %rbx, -48
+; CHECK-NEXT:    .cfi_offset %r12, -40
+; CHECK-NEXT:    .cfi_offset %r13, -32
+; CHECK-NEXT:    .cfi_offset %r14, -24
+; CHECK-NEXT:    .cfi_offset %r15, -16
+; CHECK-NEXT:    movq %r9, %r14
+; CHECK-NEXT:    movq %r8, %rbx
+; CHECK-NEXT:    movq %rcx, %r12
+; CHECK-NEXT:    movq %rdx, %r15
+; CHECK-NEXT:    movq %rsi, %r13
+; CHECK-NEXT:    testq %rdi, %rdi
+; CHECK-NEXT:    je .LBB0_1
+; CHECK-NEXT:  # %bb.2: # %if.else
+; CHECK-NEXT:    testq %r13, %r13
+; CHECK-NEXT:    movq %r15, %rax
+; CHECK-NEXT:    je .LBB0_3
+; CHECK-NEXT:  .LBB0_4: # %if.end
+; CHECK-NEXT:    addq %r13, %rax
+; CHECK-NEXT:    addq %r12, %r15
+; CHECK-NEXT:    addq %rax, %r15
+; CHECK-NEXT:    addq %r14, %rbx
+; CHECK-NEXT:    addq %r15, %rbx
+; CHECK-NEXT:    movq %rbx, %rax
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 40
+; CHECK-NEXT:    popq %r12
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    popq %r13
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    popq %r14
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    popq %r15
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB0_1: # %if.then
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    callq bar1@PLT
+; CHECK-NEXT:    jmp .LBB0_4
+; CHECK-NEXT:  .LBB0_3: # %if.then2
+; CHECK-NEXT:    callq bar2@PLT
+; CHECK-NEXT:    jmp .LBB0_4
+entry:
+  %tobool.not = icmp eq ptr %ptr, null
+  br i1 %tobool.not, label %if.then, label %if.else, !prof !5
+
+if.then:                                          ; preds = %entry
+  %call1 = call i64 @bar1()
+  br label %if.end
+
+if.else:
+  %cond = icmp eq i64 %p2, 0
+  br i1 %cond, label %if.then2, label %if.end, !prof !5
+
+if.then2:
+  %call2 = call i64 @bar2()
+  br label %if.end
+
+if.end:
+  %call = phi i64 [ %call1, %if.then ], [%call2, %if.then2], [ %p3, %if.else ]
+  %add1 = add i64 %call, %p2
+  %add2 = add i64 %add1, %p3
+  %add3 = add i64 %add2, %p4
+  %add4 = add i64 %add3, %p5
+  %res = add i64 %add4, %p6
+  ret i64 %res
+}
+
+!5 = !{!"branch_weights", i32 1, i32 2000}
+
+declare i64 @bar1()
+declare i64 @bar2()
diff --git a/llvm/test/DebugInfo/ARM/sdag-split-arg.ll b/llvm/test/DebugInfo/ARM/sdag-split-arg.ll
index de1d822a8c8015f..9699c102c0b76b8 100644
--- a/llvm/test/DebugInfo/ARM/sdag-split-arg.ll
+++ b/llvm/test/DebugInfo/ARM/sdag-split-arg.ll
@@ -19,8 +19,8 @@ target triple = "thumbv7k-apple-watchos2.0.0"
 ; Function Attrs: optsize ssp
 define i64 @_Z3foox(i64 returned) local_unnamed_addr #0 !dbg !13 {
   tail call void @llvm.dbg.value(metadata i64 %0, metadata !17, metadata !DIExpression()), !dbg !18
-  ; CHECK: @DEBUG_VALUE: foo:offset <- [DW_OP_LLVM_fragment 0 32] $r0
-  ; CHECK: @DEBUG_VALUE: foo:offset <- [DW_OP_LLVM_fragment 32 32] $r1
+  ; CHECK: @DEBUG_VALUE: foo:offset <- [DW_OP_LLVM_fragment 0 32] $r5
+  ; CHECK: @DEBUG_VALUE: foo:offset <- [DW_OP_LLVM_fragment 32 32] $r4
 
   %2 = load i64, ptr @g, align 8, !dbg !19, !tbaa !21
   %3 = icmp eq i64 %2, %0, !dbg !19

>From ffc0f0ffa4164f9b426e4b30897c86577e2aa867 Mon Sep 17 00:00:00 2001
From: eric <eric at efcs.ca>
Date: Thu, 12 Oct 2023 13:01:29 -0400
Subject: [PATCH 38/38] Fight with poor pre-commit check.

So apparently git clang-format doesn't format libc++ changes.
And yet, we have a check that enforces clang formatting.

This needs to change.
---
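For reference, the formatting complaint can usually be reproduced
locally with something like this (a sketch; the commit range is an
example):

    git clang-format --diff HEAD~1 -- libcxx/include/__config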
 libcxx/include/__config | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcxx/include/__config b/libcxx/include/__config
index 3b0a70931c9eec8..65ce6d6a27f8326 100644
--- a/libcxx/include/__config
+++ b/libcxx/include/__config
@@ -171,7 +171,7 @@
 // efficient size class by dropping the alignment requirements for std::string's
 // pointer from 16 to 8. This changes the output of std::string::max_size,
 // which makes it ABI breaking
-#   define _LIBCPP_ABI_STRING_8_BYTE_ALIGNMENT
+#    define _LIBCPP_ABI_STRING_8_BYTE_ALIGNMENT
 #  elif _LIBCPP_ABI_VERSION == 1
 #    if !(defined(_LIBCPP_OBJECT_FORMAT_COFF) || defined(_LIBCPP_OBJECT_FORMAT_XCOFF))
 // Enable compiling copies of now inline methods into the dylib to support


